diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2acfd08051c..5e4f4bfedb6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,6 +28,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
 
 option(ONEDPL_ENABLE_SIMD "Enable SIMD vectorization by passing an OpenMP SIMD flag to the compiler if supported" ON)
 option(ONEDPL_CMAKE_QUIET_CHECKS "Silence output from compiler/header checks during configuration" ON)
+option(ONEDPL_RANGES_SET_DIFFERENCE_CPP23_RESULT "Compatibility mode for result of oneapi::dpl::ranges::set_difference with C++23" OFF)
 cmake_dependent_option(ONEDPL_TEST_WIN_ICX_FIXES "Enable icx workarounds for Windows" ON "CMAKE_HOST_WIN32;NOT _onedpl_is_subproject" OFF)
 
 file(READ ${CMAKE_CURRENT_SOURCE_DIR}/include/oneapi/dpl/internal/version_impl.h
@@ -151,6 +152,9 @@ endif()
 add_library(oneDPL INTERFACE)
 target_compile_features(oneDPL INTERFACE cxx_std_17)
 target_compile_definitions(oneDPL INTERFACE $<$<CONFIG:Debug>:PSTL_USE_DEBUG=1>)
+target_compile_definitions(oneDPL INTERFACE
+    $<$<BOOL:${ONEDPL_RANGES_SET_DIFFERENCE_CPP23_RESULT}>:ONEDPL_RANGES_SET_DIFFERENCE_CPP23_RESULT=1>
+)
 
 if (CMAKE_BUILD_TYPE)
     message(STATUS "Build type is ${CMAKE_BUILD_TYPE}")
diff --git a/include/oneapi/dpl/internal/common_config.h b/include/oneapi/dpl/internal/common_config.h
index 0863e815972..43b22d5a6e3 100644
--- a/include/oneapi/dpl/internal/common_config.h
+++ b/include/oneapi/dpl/internal/common_config.h
@@ -68,4 +68,8 @@
 #    endif
 #endif // __cplusplus >= 201703L
 
+#ifndef ONEDPL_RANGES_SET_DIFFERENCE_CPP23_RESULT
+#    define ONEDPL_RANGES_SET_DIFFERENCE_CPP23_RESULT 0
+#endif
+
 #endif // _ONEDPL_COMMON_CONFIG_H
diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h
index 44482f0f49c..8062b9fc634 100644
--- a/include/oneapi/dpl/pstl/algorithm_impl.h
+++ b/include/oneapi/dpl/pstl/algorithm_impl.h
@@ -23,6 +23,8 @@
 #include <cassert>
 #include <cmath>
 #include <tuple>
+#include <array>    // for std::array
+#include <optional> // for std::optional
 
 #include "algorithm_fwd.h"
 
@@ -3281,136 +3283,828 @@ __pattern_includes(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __exec, _
     });
 }
 
-inline constexpr auto __set_algo_cut_off = 1000;
+template <typename Size>
+constexpr bool
+__is_set_algo_cutoff_exceeded(Size size)
+{
+    // 1000 is chosen as a cut-off value based on benchmarking source data sizes
+    constexpr Size __set_algo_cut_off = 1000;
+    return size > __set_algo_cut_off;
+}
 
-template <class _IsVector, class _ExecutionPolicy, class _RandomAccessIterator1, class _RandomAccessIterator2,
-          class _OutputIterator, class _SizeFunction, class _SetOP, class _Compare, class _Proj1, class _Proj2>
-_OutputIterator
-__parallel_set_op(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _RandomAccessIterator1 __first1,
-                  _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, _RandomAccessIterator2 __last2,
-                  _OutputIterator __result, _SizeFunction __size_func, _SetOP __set_op, _Compare __comp, _Proj1 __proj1,
-                  _Proj2 __proj2)
+// _ReachedOffset - describes reached offset in input range
+//  - the first field contains the amount of processed items
+//  - the second field contains the amount of processed (i.e. skipped) items at the end
+template <typename _DifferenceType>
+using _ReachedOffset = std::pair<_DifferenceType, _DifferenceType>;
+
+template <typename _DifferenceType>
+struct _DataPart
 {
-    using __backend_tag = typename __parallel_tag<_IsVector>::__backend_tag;
+    //                                       [.........................)
+    // Temporary windowed buffer:        TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT
+    //                                       ^                         ^
+    //                                       +<-(__buf_pos)            +<-(__buf_pos + __len)
+    //                                       |                         |
+    //                                       +--+                      +--+
+    //                                          |                         |
+    //                                          |<-(__pos)                |<-(__pos + __len)
+    //                                          V                         V
+    // Result buffer:                 OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO
+
+    _DifferenceType __pos{};     // Offset in output range w/o limitation to output data size
+    _DifferenceType __len{};     // The length of data pack: the same for windowed and result buffers
+    _DifferenceType __buf_pos{}; // Offset in temporary buffer w/o limitation to output data size
+
+    bool
+    empty() const
+    {
+        return __len == 0;
+    }
+
+    static bool
+    is_left(const _DataPart& __a, const _DataPart& __b)
+    {
+        return __b.__buf_pos > __a.__buf_pos || (__b.__buf_pos == __a.__buf_pos && !__b.empty());
+    }
+
+    static _DataPart
+    combine_with(const _DataPart& __a, const _DataPart& __b)
+    {
+        return is_left(__a, __b) ? _DataPart{__a.__pos + __a.__len + __b.__pos, __b.__len, __b.__buf_pos}
+                                 : _DataPart{__b.__pos + __b.__len + __a.__pos, __a.__len, __a.__buf_pos};
+    }
 
+    bool
+    is_output_size_reached(_DifferenceType __n_out) const
+    {
+        const _DifferenceType __n_out_idx = std::max(__n_out, _DifferenceType{1}) - 1; // to handle zero output size case
+
+        //                           (1).__buf_pos   (2).__buf_pos   (3).__buf_pos   (4).__buf_pos   (5).__buf_pos   (6).__buf_pos   (7).__buf_pos   (8).__buf_pos
+        //                              |               |               |               |               |               |               |               |
+        //                              V-----------)   V-------)       V-----------)   V-)             V----------)    V----)          V--)            V-)
+        // Temporary buffer: [..............................................................................................................................)
+        //
+        //                               (2).__pos       (2).__pos + (2).__len          (5).__pos       (5).__pos + (5).__len
+        //                                  |               |                              |               |
+        //                                  V               V                              V               V
+        // Result buffer:    [.......................)................................................X.............................
+        //                                           ^                                                ^
+        //                                           |                                                |
+        // Positions in result buffer:             __n_out_idx                                      __n_out_idx + 1
+
+        return __pos <= __n_out_idx && __n_out_idx < __pos + __len;
+    }
+};
+
+template <typename _DifferenceType>
+struct _SrcDataProcessingOffset
+{
+    _DifferenceType __offset = {}; // Offset in input range to processing data
+    _DifferenceType __length = {}; // Length of processing data
+};
+
+template <typename _DifferenceType1, typename _DifferenceType2>
+struct _SrcDataProcessingOffsets
+{
+    _SrcDataProcessingOffset<_DifferenceType1> __in1;
+    _SrcDataProcessingOffset<_DifferenceType2> __in2;
+};
+
+template <typename _DifferenceType1, typename _DifferenceType2>
+struct _SrcProcessedDataAmount
+{
+    _DifferenceType1 __length1 = {}; // Amount of processed data in the first input range
+    _DifferenceType2 __length2 = {}; // Amount of processed data in the second input range
+
+    static _SrcProcessedDataAmount
+    combine_with(const _SrcProcessedDataAmount& __a, const _SrcProcessedDataAmount& __b)
+    {
+        return _SrcProcessedDataAmount{std::max(__a.__length1, __b.__length1), std::max(__a.__length2, __b.__length2)};
+    }
+};
+
+// Describes a data window in the temporary buffer and corresponding positions in the output range
+template <bool __Bounded, typename _DifferenceType1, typename _DifferenceType2, typename _DifferenceTypeOut,
+          typename _DifferenceTypeMask>
+struct _SetRangeImpl
+{
+    static constexpr std::size_t _DataIndex = 0;
+    static constexpr std::size_t _SrcOffsetsIndex = 1;
+    static constexpr std::size_t _SrcProcessedDataIndex = 2;
+
+    using _DifferenceType = std::common_type_t<_DifferenceType1, _DifferenceType2, _DifferenceTypeOut>;
+
+    using _DataStorage = std::conditional_t<
+        !__Bounded, _DataPart<_DifferenceType>,
+        std::tuple<_DataPart<_DifferenceType>, _SrcDataProcessingOffsets<_DifferenceType1, _DifferenceType2>,
+                   _SrcProcessedDataAmount<_DifferenceType1, _DifferenceType2>>>;
+
+    _DataStorage __data;
+
+    const _DataPart<_DifferenceType>&
+    get_data_part() const
+    {
+        if constexpr (!__Bounded)
+            return __data;
+        else
+            return std::get<_DataIndex>(__data);
+    }
+
+    const _SrcDataProcessingOffsets<_DifferenceType1, _DifferenceType2>&
+    get_src_offsets_part() const
+    {
+        static_assert(__Bounded, "Source data offsets part is available only for bounded set operations");
+        return std::get<_SrcOffsetsIndex>(__data);
+    }
+
+    const _SrcProcessedDataAmount<_DifferenceType1, _DifferenceType2>&
+    get_src_processed_data_amount_part() const
+    {
+        static_assert(__Bounded, "Source data processed amount part is available only for bounded set operations");
+        return std::get<_SrcProcessedDataIndex>(__data);
+    }
+
+    static _SetRangeImpl
+    combine_with(const _SetRangeImpl& __a, const _SetRangeImpl& __b)
+    {
+        auto __new_data_part = _DataPart<_DifferenceType>::combine_with(__a.get_data_part(), __b.get_data_part());
+
+        if constexpr (!__Bounded)
+        {
+            return _SetRangeImpl{__new_data_part};
+        }
+        else
+        {
+            typename _SetRangeImpl::_DataStorage __ds{
+                __new_data_part,
+                _DataPart<_DifferenceType>::is_left(__a.get_data_part(), __b.get_data_part())
+                    ? __b.get_src_offsets_part()
+                    : __a.get_src_offsets_part(),
+                _SrcProcessedDataAmount<_DifferenceType1, _DifferenceType2>::combine_with(
+                    __a.get_src_processed_data_amount_part(), __b.get_src_processed_data_amount_part())};
+            return _SetRangeImpl{__ds};
+        }
+    }
+};
+
+struct _ParallelSetOpCombinePred
+{
+    template <bool __Bounded, typename _DifferenceType1, typename _DifferenceType2, typename _DifferenceTypeMask,
+              typename _DifferenceTypeOut>
+    _SetRangeImpl<__Bounded, _DifferenceType1, _DifferenceType2, _DifferenceTypeOut, _DifferenceTypeMask>
+    operator()(const _SetRangeImpl<__Bounded, _DifferenceType1, _DifferenceType2, _DifferenceTypeOut,
+                                   _DifferenceTypeMask>& __a,
+               const _SetRangeImpl<__Bounded, _DifferenceType1, _DifferenceType2, _DifferenceTypeOut,
+                                   _DifferenceTypeMask>& __b) const
+    {
+        return _SetRangeImpl<__Bounded, _DifferenceType1, _DifferenceType2, _DifferenceTypeOut,
+                             _DifferenceTypeMask>::combine_with(__a, __b);
+    }
+};
+
+template <class _RandomAccessIterator1, class _RandomAccessIterator2, class _OutputIterator>
+using __parallel_set_op_return_t =
+    oneapi::dpl::__utils::__set_operations_result<_RandomAccessIterator1, _RandomAccessIterator2, _OutputIterator>;
+
+template <class _IsVector, class _ExecutionPolicy, class _RandomAccessIterator1, class _RandomAccessIterator2,
+          class _OutputIterator, typename _Compare, typename _Proj1, typename _Proj2, typename _SetUnionOp,
+          class _SizeFunction, class _MaskSizeFunction, typename _SetRange, bool __Bounded>
+struct _SetOpReachedPosEvaluator
+{
     using _DifferenceType1 = typename std::iterator_traits<_RandomAccessIterator1>::difference_type;
     using _DifferenceType2 = typename std::iterator_traits<_RandomAccessIterator2>::difference_type;
-    using _DifferenceType = std::common_type_t<_DifferenceType1, _DifferenceType2>;
-    using _T = typename std::iterator_traits<_OutputIterator>::value_type;
+    using _DifferenceTypeOut = typename std::iterator_traits<_OutputIterator>::difference_type;
+
+    using _DifferenceType = std::common_type_t<_DifferenceType1, _DifferenceType2, _DifferenceTypeOut>;
+
+    using _SetOpReachedPosEvaluatorData = std::tuple<_DifferenceType1, _DifferenceType2, _DifferenceTypeOut>;
+
+    _SetOpReachedPosEvaluator(__parallel_tag<_IsVector> __tag, _ExecutionPolicy& __exec,
+                              _RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1,
+                              _RandomAccessIterator2 __first2, _RandomAccessIterator2 __last2,
+                              _OutputIterator __result1, _OutputIterator __result2, _Compare __comp, _Proj1 __proj1,
+                              _Proj2 __proj2, _SetUnionOp __set_union_op, _SizeFunction __size_func,
+                              _MaskSizeFunction __mask_size_func)
+        : __tag(__tag), __exec(__exec), __first1(__first1), __last1(__last1), __first2(__first2), __last2(__last2),
+          __result1(__result1), __result2(__result2), __comp(__comp), __proj1(__proj1), __proj2(__proj2),
+          __set_union_op(__set_union_op), __size_func(__size_func), __mask_size_func(__mask_size_func),
+          __n_out(__result2 - __result1)
+    {
+    }
+
+    void
+    __on_output_size_reached(std::size_t __offset_from_n_out, const _DataPart<_DifferenceType>& __data_part,
+                             const _SrcDataProcessingOffsets<_DifferenceType1, _DifferenceType2>& __source_data_offsets)
+    {
+        assert(__offset_from_n_out < 2);
+
+        __output_size_reached_info_opt[__offset_from_n_out] = OutputSizeReachedInfo{__data_part, __source_data_offsets};
+
+        // Reset the cached reached positions of the output and input ranges: they must be re-evaluated from the recorded output-size-reached information
+        __res_data_opt.reset();
+    }
+
+    void
+    __on_apex(const _SetRange& __total)
+    {
+        __apex_total = __total;
+
+        // Reset the cached reached positions of the output and input ranges: they must be re-evaluated now that the apex total has changed
+        __res_data_opt.reset();
+    }
+
+    // Get evaluated reached positions for the source ranges and output range
+    _SetOpReachedPosEvaluatorData
+    __get_reached_positions()
+    {
+        if (!__res_data_opt.has_value())
+        {
+            const _DataPart<_DifferenceType>& __apex_total_data_part = __apex_total.get_data_part();
+
+            const std::pair<_DifferenceType1, _DifferenceType2> __input_reached_positions =
+                __eval_reached_input_positions();
+
+            __res_data_opt.emplace(__input_reached_positions.first, __input_reached_positions.second,
+                                   std::min(__apex_total_data_part.__pos + __apex_total_data_part.__len, __n_out));
+        }
 
-    struct _SetRange
+        return __res_data_opt.value();
+    }
+
+  protected:
+    struct _NoopConstruct
     {
-        _DifferenceType __pos, __len, __buf_pos;
-        bool
-        empty() const
+        template <typename _ForwardIterator>
+        std::nullptr_t
+        operator()(_ForwardIterator, _ForwardIterator, std::nullptr_t)
         {
-            return __len == 0;
+            return nullptr;
         }
     };
 
-    const _DifferenceType1 __n1 = __last1 - __first1;
-    const _DifferenceType2 __n2 = __last2 - __first2;
+    template <typename _DifferenceTypeArg>
+    _DifferenceTypeArg
+    __eval_reached_pos(oneapi::dpl::__utils::__parallel_set_op_mask* __mask_buffer_begin,
+                       oneapi::dpl::__utils::__parallel_set_op_mask* __mask_buffer_end,
+                       oneapi::dpl::__utils::__parallel_set_op_mask __dest_data_mask_state, _DifferenceTypeOut __pos_no,
+                       _DifferenceTypeArg __reached_pos) const
+    {
+        assert(__dest_data_mask_state == oneapi::dpl::__utils::__parallel_set_op_mask::eData1 ||
+               __dest_data_mask_state == oneapi::dpl::__utils::__parallel_set_op_mask::eData2);
 
-    __par_backend::__buffer<_T> __buf(__size_func(__n1, __n2));
-
-    return __internal::__except_handler([&__exec, __n1, __first1, __last1, __first2, __last2, __result, __size_func,
-                                         __set_op, &__buf, __comp, __proj1, __proj2]() {
-        auto __tmp_memory = __buf.get();
-        _DifferenceType1 __m{};
-        auto __scan = [=](_DifferenceType1, _DifferenceType1, const _SetRange& __s) { // Scan
-            if (!__s.empty())
-                __brick_move_destroy<__parallel_tag<_IsVector>>{}(__tmp_memory + __s.__buf_pos,
-                                                                  __tmp_memory + (__s.__buf_pos + __s.__len),
-                                                                  __result + __s.__pos, _IsVector{});
-        };
-        __par_backend::__parallel_strict_scan(
-            __backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __n1, _SetRange{0, 0, 0},
-            [=](_DifferenceType1 __i, _DifferenceType1 __len) { // Reduce
-                //[__b; __e) - a subrange of the first sequence, to reduce
-                _RandomAccessIterator1 __b = __first1 + __i;
-                _RandomAccessIterator1 __e = __first1 + (__i + __len);
-
-                //try searching for the first element which not equal to *__b
-                if (__b != __first1)
-                    __b += __internal::__pstl_upper_bound(__b, _DifferenceType1{0}, __last1 - __b, __b, __comp, __proj1, __proj1);
-
-                //try searching for the first element which not equal to *__e
-                if (__e != __last1)
-                    __e += __internal::__pstl_upper_bound(__e, _DifferenceType1{0}, __last1 - __e, __e, __comp, __proj1, __proj1);
-
-                //check is [__b; __e) empty
-                if (__e - __b < 1)
-                {
-                    _RandomAccessIterator2 __bb = __last2;
-                    if (__b != __last1)
-                        __bb = __first2 + __internal::__pstl_lower_bound(__first2, _DifferenceType2{0}, __last2 - __first2,
-                                                                      __b, __comp, __proj2, __proj1);
+        auto __mask_buffer_it = __mask_buffer_begin;
 
-                    const _DifferenceType __buf_pos = __size_func((__b - __first1), (__bb - __first2));
-                    return _SetRange{0, 0, __buf_pos};
-                }
+        for (; __mask_buffer_it != __mask_buffer_end && __pos_no < __n_out; ++__mask_buffer_it)
+        {
+            auto __state = *__mask_buffer_it;
 
-                //try searching for "corresponding" subrange [__bb; __ee) in the second sequence
-                _RandomAccessIterator2 __bb = __first2;
-                if (__b != __first1)
-                    __bb = __first2 + __internal::__pstl_lower_bound(__first2, _DifferenceType2{0}, __last2 - __first2,
-                                                                     __b, __comp, __proj2, __proj1);
+            __pos_no += __test_mask(oneapi::dpl::__utils::__parallel_set_op_mask::eDataOut, __state) ? 1 : 0;
+            __reached_pos += __test_mask(__dest_data_mask_state, __state) ? 1 : 0;
+        }
 
-                _RandomAccessIterator2 __ee = __last2;
-                if (__e != __last1)
-                    __ee = __bb + __internal::__pstl_lower_bound(__bb, _DifferenceType2{0}, __last2 - __bb, __e, __comp,
-                                                                 __proj2, __proj1);
+        // Then pass the following positions which do not generate output
+        for (; __mask_buffer_it != __mask_buffer_end; ++__mask_buffer_it)
+        {
+            auto __state = *__mask_buffer_it;
 
-                const _DifferenceType __buf_pos = __size_func((__b - __first1), (__bb - __first2));
-                auto __buffer_b = __tmp_memory + __buf_pos;
-                auto __res = __set_op(__b, __e, __bb, __ee, __buffer_b, __comp, __proj1, __proj2);
+            // Break if we detect a mask which describes output data generation from the specified data set
+            if (__test_mask(oneapi::dpl::__utils::__parallel_set_op_mask::eDataOut, __state))
+                break;
 
-                return _SetRange{0, __res - __buffer_b, __buf_pos};
-            },
-            [](const _SetRange& __a, const _SetRange& __b) { // Combine
-                if (__b.__buf_pos > __a.__buf_pos || ((__b.__buf_pos == __a.__buf_pos) && !__b.empty()))
-                    return _SetRange{__a.__pos + __a.__len + __b.__pos, __b.__len, __b.__buf_pos};
-                return _SetRange{__b.__pos + __b.__len + __a.__pos, __a.__len, __a.__buf_pos};
-            },
-            __scan,                                     // Scan
-            [&__m, &__scan](const _SetRange& __total) { // Apex
-                //final scan
-                __scan(0, 0, __total);
-                __m = __total.__pos + __total.__len;
-            });
-        return __result + __m;
+            __reached_pos += __test_mask(__dest_data_mask_state, __state) ? 1 : 0;
+        }
+
+        return __reached_pos;
+    }
+
+    template <bool _IsFirstRange, typename _DifferenceType1, typename _DifferenceType2>
+    const _SrcDataProcessingOffset<std::conditional_t<_IsFirstRange, _DifferenceType1, _DifferenceType2>>&
+    __get_source_data_offset_part(
+        const _SrcDataProcessingOffsets<_DifferenceType1, _DifferenceType2>& __src_offsets_part) const
+    {
+        if constexpr (_IsFirstRange)
+            return __src_offsets_part.__in1;
+        else
+            return __src_offsets_part.__in2;
+    }
+
+    template <bool _IsFirstRange, typename _RandomAccessIterator,
+              typename _DifferenceType = typename std::iterator_traits<_RandomAccessIterator>::difference_type>
+    std::pair<_DifferenceType, _DifferenceType>
+    __eval_offset_and_size(_RandomAccessIterator __first, _RandomAccessIterator __last) const
+    {
+        _DifferenceType __offset = 0;
+        _DifferenceType __length = __last - __first;
+
+        assert(__output_size_reached_info_opt[0].has_value());
+
+        const auto& __offset_part_n0 =
+            __get_source_data_offset_part<_IsFirstRange>(__output_size_reached_info_opt[0].value().__src_offsets_part);
+        __offset = __offset_part_n0.__offset;
+        __length = __offset_part_n0.__length;
+        assert(__offset + __length <= __last - __first);
+
+        if (__output_size_reached_info_opt[1].has_value())
+        {
+            const auto& __offset_part_n1 = __get_source_data_offset_part<_IsFirstRange>(
+                __output_size_reached_info_opt[1].value().__src_offsets_part);
+            _DifferenceType __offset_n1 = __offset_part_n1.__offset;
+            _DifferenceType __length_n1 = __offset_part_n1.__length;
+
+            if (__offset_n1 + __length_n1 > __offset + __length)
+            {
+                // Process till the end of the second data part
+                __length = __offset_n1 + __length_n1 - __offset;
+            }
+        }
+
+        return {__offset, __length};
+    }
+
+    std::pair<_DifferenceType1, _DifferenceType2>
+    __eval_reached_input_positions() const
+    {
+        // Returns the reached positions (amounts of processed items) in the first and the second input ranges
+        if constexpr (!__Bounded)
+        {
+            // Unbounded set operation: no output size limit can be hit, both input ranges are processed entirely
+            return {__last1 - __first1, __last2 - __first2};
+        }
+        else
+        {
+            // While no data part has reported that the output size was reached, the result is the amounts of
+            // processed input data accumulated in the apex total
+            if (!__output_size_reached_info_opt[0].has_value())
+            {
+                const _SrcProcessedDataAmount<_DifferenceType1, _DifferenceType2>& __src_processed =
+                    __apex_total.get_src_processed_data_amount_part();
+                return {__src_processed.__length1, __src_processed.__length2};
+            }
+
+            // Evaluate the windows of the input ranges to re-process, then create & fill the buffer with mask
+            const auto [__offset1, __size1] = __eval_offset_and_size<true>(__first1, __last1);
+            const auto [__offset2, __size2] = __eval_offset_and_size<false>(__first2, __last2);
+
+            const auto __mask_buf_size = __mask_size_func(__size1, __size2);
+
+            // We need to have initialized memory under mask buffer
+            std::vector<oneapi::dpl::__utils::__parallel_set_op_mask> __mask_bufs(
+                __mask_buf_size, oneapi::dpl::__utils::__parallel_set_op_mask::eNone);
+
+            auto [__first1_tmp_reached, __first2_tmp_reached, __output_discard_it_reached, __mask_buffer_reached] =
+                __set_union_op(
+                    __first1 + __offset1, __first1 + __offset1 + __size1, // First input range bounds
+                    __first2 + __offset2, __first2 + __offset2 + __size2, // Second input range bounds
+                    oneapi::dpl::__utils::_SetOpDiscardIterator{}, // No real output buffer, so using discard iterator
+                    __comp, __proj1, __proj2, __mask_bufs.data());
+            assert(__mask_buffer_reached - __mask_bufs.data() <= static_cast<std::ptrdiff_t>(__mask_bufs.size()));
+
+            // Structured bindings cannot be captured by lambdas before C++20: keep the reached mask position in a local
+            const auto __mask_buffer_end = __mask_buffer_reached;
+
+            assert(__output_size_reached_info_opt[0].has_value());
+            const OutputSizeReachedInfo& __ri_n0 = __output_size_reached_info_opt[0].value();
+
+            using __backend_tag = typename decltype(__tag)::__backend_tag;
+
+            // Calculate reached positions based on mask buffer
+            _DifferenceType1 __res_reachedPos1 = {};
+            _DifferenceType2 __res_reachedPos2 = {};
+            __par_backend::__parallel_invoke(
+                __backend_tag{}, __exec,
+                [&]() {
+                    __res_reachedPos1 = __eval_reached_pos(
+                        __mask_bufs.data(), __mask_buffer_end, oneapi::dpl::__utils::__parallel_set_op_mask::eData1,
+                        __ri_n0.__data_part.__pos, __ri_n0.__src_offsets_part.__in1.__offset);
+                },
+                [&]() {
+                    __res_reachedPos2 = __eval_reached_pos(
+                        __mask_bufs.data(), __mask_buffer_end, oneapi::dpl::__utils::__parallel_set_op_mask::eData2,
+                        __ri_n0.__data_part.__pos, __ri_n0.__src_offsets_part.__in2.__offset);
+                });
+
+            return {__res_reachedPos1, __res_reachedPos2};
+        }
+    }
+
+    bool
+    __test_mask(oneapi::dpl::__utils::__parallel_set_op_mask __checking_mask_state,
+                oneapi::dpl::__utils::__parallel_set_op_mask __real_mask_state) const noexcept
+    {
+        using _UT = std::underlying_type_t<oneapi::dpl::__utils::__parallel_set_op_mask>;
+
+        const _UT __state_value = static_cast<_UT>(__real_mask_state);
+
+        // The zero state is incorrect mask state!
+        assert(__state_value != 0);
+
+        // Check correct memory state
+        [[maybe_unused]] constexpr _UT __valid_bits =
+            static_cast<_UT>(oneapi::dpl::__utils::__parallel_set_op_mask::eBothOut);
+        assert((__state_value & (~__valid_bits)) == 0);
+
+        return (__state_value & static_cast<_UT>(__checking_mask_state)) == static_cast<_UT>(__checking_mask_state);
+    }
+
+  protected:
+    __parallel_tag<_IsVector> __tag;
+    _ExecutionPolicy& __exec;
+
+    _RandomAccessIterator1 __first1, __last1;
+    _RandomAccessIterator2 __first2, __last2;
+    _OutputIterator __result1, __result2;
+    _Compare __comp;
+    _Proj1 __proj1;
+    _Proj2 __proj2;
+    _SetUnionOp __set_union_op;
+    _SizeFunction __size_func;
+    _MaskSizeFunction __mask_size_func;
+
+    const _DifferenceTypeOut __n_out = {}; // Size of output range
+
+    _SetRange __apex_total;
+
+    struct OutputSizeReachedInfo
+    {
+        _DataPart<_DifferenceType> __data_part;
+        _SrcDataProcessingOffsets<_DifferenceType1, _DifferenceType2> __src_offsets_part;
+    };
+
+    // Information about the two data parts which produce output where the output size is reached:
+    // - element 0: the part which reached the output size (__n_out)
+    // - element 1: the part which reached the output size plus one (__n_out + 1)
+    std::optional<OutputSizeReachedInfo> __output_size_reached_info_opt[2];
+
+    // Reached positions in the input and output ranges
+    std::optional<_SetOpReachedPosEvaluatorData> __res_data_opt;
+};
+
+template <bool __Bounded, class _IsVector, typename _ExecutionPolicy, typename ProcessingDataPointer,
+          typename _SetRange, typename _OutputIterator, typename _SetOpReachedPosEvaluator>
+struct _ParallelSetOpScanPred
+{
+    __parallel_tag<_IsVector> __tag;
+    _ExecutionPolicy& __exec;
+    ProcessingDataPointer __buf_pos_begin, __buf_pos_end;         // Temporary data buffer (windowed)
+    _OutputIterator __result_buf_pos_begin, __result_buf_pos_end; // Result data buffer
+    _SetOpReachedPosEvaluator& __source_final_pos_evaluator; // Evaluator of the final position in the source ranges
+
+    template <typename _DifferenceType>
+    void
+    operator()(_DifferenceType, _DifferenceType, const _SetRange& __s) const
+    {
+        // Moves the data part described by __s from the temporary buffer into the result range
+        const _DataPart<_DifferenceType>& __data_part = __s.get_data_part();
+        if constexpr (!__Bounded)
+        {
+            // Copy source data (unbounded): the whole data part is moved, no clamping is applied
+            __copy_data_to_result_buf(__data_part);
+        }
+        else
+        {
+            // Copy source data (bounded); until something is copied, the not copied data starts at the part begin
+            const ProcessingDataPointer __part_begin = __buf_pos_begin + __data_part.__buf_pos;
+            ProcessingDataPointer __buf_pos_start_of_not_copied = __part_begin;
+            const auto __remaining_data_size = __eval_remaining_data_size(__data_part);
+            if (__remaining_data_size > 0)
+                __buf_pos_start_of_not_copied = __copy_data_to_result_buf_bounded(__data_part, __remaining_data_size);
+
+            // Destroy only the not copied tail of this data part: the rest of the buffer belongs to other parts
+            if (__remaining_data_size < __data_part.__len)
+                __brick_destroy(__buf_pos_start_of_not_copied, __part_begin + __data_part.__len, _IsVector{});
+
+            const _DifferenceType __n_out = __result_buf_pos_end - __result_buf_pos_begin;
+            // Notify the evaluator if this subrange holds the last output position (0) or the first one past it (1)
+            for (_DifferenceType __n_offset : {0, 1})
+            {
+                if (__data_part.is_output_size_reached(__n_out + __n_offset))
+                    __source_final_pos_evaluator.__on_output_size_reached(__n_offset, __data_part,
+                                                                          __s.get_src_offsets_part());
+            }
+        }
+    }
+
+    void
+    __on_apex(const _SetRange& __total)
+    {
+        __source_final_pos_evaluator.__on_apex(__total);
+    }
+
+  protected:
+    template <typename _DifferenceType>
+    void
+    __copy_data_to_result_buf(const _DataPart<_DifferenceType>& __data_part) const
+    {
+        // Processed data
+        __brick_move_destroy<decltype(__tag)>{}(__buf_pos_begin + __data_part.__buf_pos,
+                                                __buf_pos_begin + __data_part.__buf_pos + __data_part.__len,
+                                                __result_buf_pos_begin + __data_part.__pos, _IsVector{});
+    }
+
+    template <typename _DifferenceType>
+    typename std::iterator_traits<_OutputIterator>::difference_type
+    __eval_remaining_data_size(const _DataPart<_DifferenceType>& __data_part) const
+    {
+        // Clamp both ends of the chunk's output window [__pos, __pos + __len) to __result_buf_pos_end
+        const auto __result_from = __advance_clamped(__result_buf_pos_begin, __data_part.__pos, __result_buf_pos_end);
+        const auto __result_to =
+            __advance_clamped(__result_buf_pos_begin, __data_part.__pos + __data_part.__len, __result_buf_pos_end);
+
+        return __result_to - __result_from; // length of the clamped window, i.e. what still fits in the result range
+    }
+
+    template <typename _DifferenceType>
+    ProcessingDataPointer
+    __copy_data_to_result_buf_bounded(const _DataPart<_DifferenceType>& __data_part,
+                                      _DifferenceType __result_remaining) const
+    {
+        // Output position where this chunk starts, clamped to the end of the result range
+        const auto __result_from = __advance_clamped(__result_buf_pos_begin, __data_part.__pos, __result_buf_pos_end);
+
+        assert(__result_remaining <= __data_part.__len);
+
+        // Source window in the temporary buffer: __result_remaining elements from __buf_pos, clamped to the buffer end
+        const auto __buf_pos_from = __advance_clamped(__buf_pos_begin, __data_part.__buf_pos, __buf_pos_end);
+        const auto __buf_pos_to =
+            __advance_clamped(__buf_pos_begin, __data_part.__buf_pos + __result_remaining, __buf_pos_end);
+
+        // Move that window into the result range; the returned pointer marks the first element that was not moved
+        __brick_move_destroy<decltype(__tag)>{}(__buf_pos_from, __buf_pos_to, __result_from, _IsVector{});
+
+        return __buf_pos_to;
+    }
+
+    // Move it1 forward by n, but not beyond it2 (it1 <= it2 is asserted); Size need not be the iterator distance type
+    template <typename _RandomAccessIterator,
+              typename Size = typename std::iterator_traits<_RandomAccessIterator>::difference_type>
+    _RandomAccessIterator
+    __advance_clamped(_RandomAccessIterator it1, Size n, _RandomAccessIterator it2) const
+    { // std::min takes two arguments of one type, so the common type is named explicitly below
+        assert(it1 <= it2);
+        return it1 + std::min<std::common_type_t<decltype(it2 - it1), Size>>(it2 - it1, n);
+    }
+};
+
+template <bool __Bounded, typename _Tag, typename _ExecutionPolicy, typename _SetRange, typename _RandomAccessIterator1,
+          typename _RandomAccessIterator2, typename _OutputIterator, typename _SizeFunction, typename _MaskSizeFunction,
+          typename _SetUnionOp, typename _Compare, typename _Proj1, typename _Proj2, typename _T>
+struct _ParallelSetOpStrictReducePred // reduce-step functor handed to __parallel_strict_scan by __parallel_set_op
+{
+    _Tag __tag;
+    _ExecutionPolicy& __exec;
+
+    _RandomAccessIterator1 __first1, __last1;
+    _RandomAccessIterator2 __first2, __last2;
+    _SizeFunction __size_func;
+    _MaskSizeFunction __mask_size_func;
+    _SetUnionOp __set_union_op;
+
+    _Compare __comp;
+    _Proj1 __proj1;
+    _Proj2 __proj2;
+
+    _T* __buf_raw_data_begin = nullptr;
+
+    using _DifferenceType1 = typename std::iterator_traits<_RandomAccessIterator1>::difference_type;
+    using _DifferenceType2 = typename std::iterator_traits<_RandomAccessIterator2>::difference_type;
+    using _DifferenceTypeOutput = typename std::iterator_traits<_OutputIterator>::difference_type;
+    using _DifferenceType = std::common_type_t<_DifferenceType1, _DifferenceType2, _DifferenceTypeOutput>;
+
+    _SetRange
+    operator()(_DifferenceType1 __i, _DifferenceType1 __len) const
+    {
+        // [__b; __e) is the subrange of the first sequence handled by this reduction step
+        _RandomAccessIterator1 __b = __first1 + __i;
+        _RandomAccessIterator1 __e = __first1 + __i + __len;
+
+        // Advance __b to the first element that is not equal to *__b (skipped when __b == __first1)
+        if (__b != __first1)
+            __b +=
+                __internal::__pstl_upper_bound(__b, _DifferenceType1{0}, __last1 - __b, __b, __comp, __proj1, __proj1);
+
+        // Advance __e to the first element that is not equal to *__e (skipped when __e == __last1)
+        if (__e != __last1)
+            __e +=
+                __internal::__pstl_upper_bound(__e, _DifferenceType1{0}, __last1 - __e, __e, __comp, __proj1, __proj1);
+
+        // Check whether [__b; __e) is empty; if so, return a zero-length part without calling __set_union_op
+        if (__e - __b < 1)
+        {
+            _RandomAccessIterator2 __bb = __last2;
+            if (__b != __last1)
+                __bb = __first2 + __internal::__pstl_lower_bound(__first2, _DifferenceType2{0}, __last2 - __first2, __b,
+                                                                 __comp, __proj2, __proj1);
+
+            const _DifferenceType __buf_pos = __size_func(__b - __first1, __bb - __first2);
+
+            _DataPart<_DifferenceType> __new_processing_data{0, 0, __buf_pos};
+
+            if constexpr (!__Bounded)
+            {
+                return _SetRange{__new_processing_data};
+            }
+            else
+            {
+                _SrcDataProcessingOffsets<_DifferenceType1, _DifferenceType2> __new_offsets_to_processing_data{
+                    {__b - __first1, 0}, {__bb - __first2, 0}};
+
+                _SrcProcessedDataAmount<_DifferenceType1, _DifferenceType2> __new_processed_data_amount{0, 0};
+
+                typename _SetRange::_DataStorage _ds{__new_processing_data, __new_offsets_to_processing_data,
+                                                     __new_processed_data_amount};
+
+                return _SetRange{_ds};
+            }
+        }
+
+        // Search for the "corresponding" subrange [__bb; __ee) in the second sequence
+        _RandomAccessIterator2 __bb = __first2;
+        if (__b != __first1)
+            __bb = __first2 + __internal::__pstl_lower_bound(__first2, _DifferenceType2{0}, __last2 - __first2, __b,
+                                                             __comp, __proj2, __proj1);
+
+        _RandomAccessIterator2 __ee = __last2;
+        if (__e != __last1)
+            __ee = __bb + __internal::__pstl_lower_bound(__bb, _DifferenceType2{0}, __last2 - __bb, __e, __comp,
+                                                         __proj2, __proj1);
+
+        const _DifferenceType __buf_pos = __size_func(__b - __first1, __bb - __first2);
+
+        _T* __buffer_b = __buf_raw_data_begin + __buf_pos;
+
+        auto [__it1_reached, __it2_reached, __output_reached, __mask_reached] =
+            __set_union_op(__b, __e, __bb, __ee, __buffer_b, __comp, __proj1, __proj2, nullptr);
+
+        // Describe the produced chunk: its length is the distance __set_union_op advanced from __buffer_b
+        const _DataPart<_DifferenceType> __new_processing_data{0, __output_reached - __buffer_b, __buf_pos};
+
+        if constexpr (!__Bounded)
+        {
+            return _SetRange{__new_processing_data};
+        }
+        else
+        {
+            _SrcDataProcessingOffsets<_DifferenceType1, _DifferenceType2> __new_offsets_to_processing_data{
+                {__b - __first1, __it1_reached - __b}, {__bb - __first2, __it2_reached - __bb}};
+
+            const bool __something_reached = __it1_reached != __b || __it2_reached != __bb;
+
+            _SrcProcessedDataAmount<_DifferenceType1, _DifferenceType2> __new_processed_data_amount{
+                __something_reached ? __it1_reached - __first1 : 0, __it2_reached - __first2};
+
+            typename _SetRange::_DataStorage _ds{__new_processing_data, __new_offsets_to_processing_data,
+                                                 __new_processed_data_amount};
+
+            return _SetRange{_ds};
+        }
+    }
+};
+
+template <bool __Bounded, class _IsVector, typename _ExecutionPolicy, typename ProcessingDataPointer,
+          typename _SetRange, typename _OutputIterator, typename _DifferenceType, typename _SetOpReachedPosEvaluator>
+struct _ParallelSetOpApexPred // apex functor handed to __parallel_strict_scan; wraps a reference to the scan predicate
+{
+    _ParallelSetOpScanPred<__Bounded, _IsVector, _ExecutionPolicy, ProcessingDataPointer, _SetRange, _OutputIterator,
+                           _SetOpReachedPosEvaluator>& __scan_pred;
+
+    void
+    operator()(const _SetRange& __total) const
+    {
+        // Final scan: invoke the scan predicate with zero for its first two arguments and the total range
+        __scan_pred(/* 0 */ _DifferenceType{}, /* 0 */ _DifferenceType{}, __total);
+
+        __scan_pred.__on_apex(__total); // then let the scan predicate run its apex hook on the same total
+    }
+};
+
+template <bool __Bounded, class _IsVector, class _ExecutionPolicy, class _RandomAccessIterator1,
+          class _RandomAccessIterator2, class _OutputIterator, class _Compare, class _Proj1, class _Proj2,
+          class _SizeFunction, class _MaskSizeFunction, class _SetUnionOp>
+__parallel_set_op_return_t<_RandomAccessIterator1, _RandomAccessIterator2, _OutputIterator>
+__parallel_set_op(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __exec, _RandomAccessIterator1 __first1,
+                  _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, _RandomAccessIterator2 __last2,
+                  _OutputIterator __result1, _OutputIterator __result2, _Compare __comp, _Proj1 __proj1, _Proj2 __proj2,
+                  _SizeFunction __size_func, _MaskSizeFunction __mask_size_func, _SetUnionOp __set_union_op)
+{ // Drives __parallel_strict_scan over the first sequence; returns the reached in1 / in2 / out positions
+    using __backend_tag = typename __parallel_tag<_IsVector>::__backend_tag;
+
+    using _DifferenceType1 = typename std::iterator_traits<_RandomAccessIterator1>::difference_type;
+    using _DifferenceType2 = typename std::iterator_traits<_RandomAccessIterator2>::difference_type;
+    using _DifferenceTypeOutput = typename std::iterator_traits<_OutputIterator>::difference_type;
+    using _DifferenceType = std::common_type_t<_DifferenceType1, _DifferenceType2, _DifferenceTypeOutput>;
+    using _T = typename std::iterator_traits<_OutputIterator>::value_type;
+
+    const _DifferenceType1 __n1 = __last1 - __first1;
+    const _DifferenceType2 __n2 = __last2 - __first2;
+
+    const _DifferenceType __buf_size = __size_func(__n1, __n2);
+    __par_backend::__buffer<_T> __buf(__buf_size); // Temporary (windowed) buffer for result preparation
+
+    using __mask_difference_type_t =
+        typename std::iterator_traits<oneapi::dpl::__utils::__parallel_set_op_mask*>::difference_type;
+
+    using _SetRange =
+        _SetRangeImpl<__Bounded, _DifferenceType1, _DifferenceType2, _DifferenceTypeOutput, __mask_difference_type_t>;
+
+    return __internal::__except_handler([__tag, &__exec, __n1, __first1, __last1, __first2, __last2, __result1,
+                                         __result2, __comp, __proj1, __proj2, __size_func, __mask_size_func,
+                                         __set_union_op, &__buf, __buf_size]() {
+        // Raw begin/end pointers of the temporary buffer
+        _T* __buf_raw_data_begin = __buf.get();
+        _T* __buf_raw_data_end = __buf_raw_data_begin + __buf_size;
+
+        _SetOpReachedPosEvaluator<_IsVector, _ExecutionPolicy, _RandomAccessIterator1, _RandomAccessIterator2,
+                                  _OutputIterator, _Compare, _Proj1, _Proj2, _SetUnionOp, _SizeFunction,
+                                  _MaskSizeFunction, _SetRange, __Bounded>
+            __source_final_pos_evaluator(__tag, __exec, __first1, __last1, __first2, __last2, __result1, __result2,
+                                         __comp, __proj1, __proj2, __set_union_op, __size_func, __mask_size_func);
+
+        // Scan predicate: receives the buffer bounds, the result range and the reached-position evaluator
+        _ParallelSetOpScanPred<__Bounded, _IsVector, _ExecutionPolicy, _T*, _SetRange, _OutputIterator,
+                               decltype(__source_final_pos_evaluator)>
+            __scan_pred{__tag,     __exec,    __buf_raw_data_begin,        __buf_raw_data_end,
+                        __result1, __result2, __source_final_pos_evaluator};
+
+        _ParallelSetOpStrictReducePred<__Bounded, __parallel_tag<_IsVector>, _ExecutionPolicy, _SetRange,
+                                       _RandomAccessIterator1, _RandomAccessIterator2, _OutputIterator, _SizeFunction,
+                                       _MaskSizeFunction, _SetUnionOp, _Compare, _Proj1, _Proj2, _T>
+            __reduce_pred{__tag,
+                          __exec,
+                          __first1,
+                          __last1,
+                          __first2,
+                          __last2,
+                          __size_func,
+                          __mask_size_func,
+                          __set_union_op,
+                          __comp,
+                          __proj1,
+                          __proj2,
+                          __buf_raw_data_begin};
+
+        _ParallelSetOpCombinePred __combine_pred;
+
+        _ParallelSetOpApexPred<__Bounded, _IsVector, _ExecutionPolicy, _T*, _SetRange, _OutputIterator,
+                               _DifferenceType1, decltype(__source_final_pos_evaluator)>
+            __apex_pred{__scan_pred};
+
+        __par_backend::__parallel_strict_scan(__backend_tag{}, __exec, __n1, _SetRange(), __reduce_pred, __combine_pred,
+                                              __scan_pred, __apex_pred);
+
+        // Offsets reached in both source ranges and in the output, as reported by the evaluator after the scan
+        const auto [__res_reachedPos1, __res_reachedPos2, __res_reachedPosOut] =
+            __source_final_pos_evaluator.__get_reached_positions();
+
+        return __parallel_set_op_return_t<_RandomAccessIterator1, _RandomAccessIterator2, _OutputIterator>{
+            __first1 + __res_reachedPos1, __first2 + __res_reachedPos2, __result1 + __res_reachedPosOut};
     });
 }
 
 //a shared parallel pattern for '__pattern_set_union' and '__pattern_set_symmetric_difference'
-template <class _IsVector, class _ExecutionPolicy, class _RandomAccessIterator1, class _RandomAccessIterator2,
-          class _OutputIterator, class _SetUnionOp, class _Compare, class _Proj1, class _Proj2>
-_OutputIterator
+template <bool __Bounded, class _IsVector, class _ExecutionPolicy, class _RandomAccessIterator1,
+          class _RandomAccessIterator2, class _OutputIterator, class _Compare, class _Proj1, class _Proj2,
+          class _SetUnionOp>
+oneapi::dpl::__utils::__set_operations_result<_RandomAccessIterator1, _RandomAccessIterator2, _OutputIterator>
 __parallel_set_union_op(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __exec, _RandomAccessIterator1 __first1,
                         _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, _RandomAccessIterator2 __last2,
-                        _OutputIterator __result, _SetUnionOp __set_union_op, _Compare __comp, _Proj1 __proj1,
-                        _Proj2 __proj2)
+                        _OutputIterator __result1, _OutputIterator __result2, _Compare __comp, _Proj1 __proj1,
+                        _Proj2 __proj2, _SetUnionOp __set_union_op)
 {
     using __backend_tag = typename __parallel_tag<_IsVector>::__backend_tag;
 
     using _DifferenceType1 = typename std::iterator_traits<_RandomAccessIterator1>::difference_type;
     using _DifferenceType2 = typename std::iterator_traits<_RandomAccessIterator2>::difference_type;
-    using _DifferenceType = std::common_type_t<_DifferenceType1, _DifferenceType2>;
+    using _DifferenceTypeOutput = typename std::iterator_traits<_OutputIterator>::difference_type;
+    using _DifferenceType = std::common_type_t<_DifferenceType1, _DifferenceType2, _DifferenceTypeOutput>;
 
-    const auto __n1 = __last1 - __first1;
-    const auto __n2 = __last2 - __first2;
+    const _DifferenceType1 __n1 = __last1 - __first1;
+    const _DifferenceType2 __n2 = __last2 - __first2;
+    const _DifferenceTypeOutput __n_out = __result2 - __result1; // length of the output range [__result1, __result2)
 
     __brick_copy<__parallel_tag<_IsVector>> __copy_range{};
 
     // {1} {}: parallel copying just first sequence
     if (__n2 == 0)
-        return __internal::__pattern_walk2_brick(__tag, ::std::forward<_ExecutionPolicy>(__exec), __first1, __last1,
-                                                 __result, __copy_range);
+    { // in bounded mode at most __n_out elements of the first sequence are copied
+        _RandomAccessIterator1 __last1_tmp = !__Bounded ? __last1 : __first1 + std::min<_DifferenceType>(__n1, __n_out);
+
+        _OutputIterator __result_finish = __internal::__pattern_walk2_brick(
+            __tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1_tmp, __result1, __copy_range);
+
+        return {__last1_tmp, __first2, __result_finish};
+    }
 
     // {} {2}: parallel copying just second sequence
     if (__n1 == 0)
-        return __internal::__pattern_walk2_brick(__tag, ::std::forward<_ExecutionPolicy>(__exec), __first2, __last2,
-                                                 __result, __copy_range);
+    { // in bounded mode at most __n_out elements of the second sequence are copied
+        _RandomAccessIterator2 __last2_tmp = !__Bounded ? __last2 : __first2 + std::min<_DifferenceType>(__n2, __n_out);
+
+        _OutputIterator __result_finish = __internal::__pattern_walk2_brick(
+            __tag, std::forward<_ExecutionPolicy>(__exec), __first2, __last2_tmp, __result1, __copy_range);
+
+        return {__first1, __last2_tmp, __result_finish};
+    }
 
     // testing  whether the sequences are intersected
     _RandomAccessIterator1 __left_bound_seq_1 =
@@ -3419,16 +4113,26 @@ __parallel_set_union_op(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __ex
 
     if (__left_bound_seq_1 == __last1)
     {
+        _RandomAccessIterator1 __last1_tmp = !__Bounded ? __last1 : __first1 + std::min<_DifferenceType>(__n1, __n_out);
+        const _DifferenceType1 __n1_tmp = __last1_tmp - __first1; // seq1 elements that will be copied
+
+        _RandomAccessIterator2 __last2_tmp =
+            !__Bounded ? __last2
+                       : __first2 + std::min<_DifferenceType>(__n2, __n_out > __n1_tmp ? __n_out - __n1_tmp : 0);
+        const _DifferenceType2 __n2_tmp = __last2_tmp - __first2; // seq2 elements copied right after them
+
         //{1} < {2}: seq2 is wholly greater than seq1, so, do parallel copying seq1 and seq2
         __par_backend::__parallel_invoke(
             __backend_tag{}, __exec,
             [=, &__exec] {
-                __internal::__pattern_walk2_brick(__tag, __exec, __first1, __last1, __result, __copy_range);
+                __internal::__pattern_walk2_brick(__tag, __exec, __first1, __last1_tmp, __result1, __copy_range);
             },
             [=, &__exec] {
-                __internal::__pattern_walk2_brick(__tag, __exec, __first2, __last2, __result + __n1, __copy_range);
+                __internal::__pattern_walk2_brick(__tag, __exec, __first2, __last2_tmp, __result1 + __n1_tmp,
+                                                  __copy_range);
             });
-        return __result + __n1 + __n2;
+
+        return {__last1_tmp, __last2_tmp, __result1 + __n1_tmp + __n2_tmp};
     }
 
     // testing  whether the sequences are intersected
@@ -3438,62 +4142,91 @@ __parallel_set_union_op(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __ex
 
     if (__left_bound_seq_2 == __last2)
     {
+        _RandomAccessIterator2 __last2_tmp = !__Bounded ? __last2 : __first2 + std::min<_DifferenceType>(__n2, __n_out);
+        const _DifferenceType2 __n2_tmp = __last2_tmp - __first2; // seq2 elements that will be copied
+
+        _RandomAccessIterator1 __last1_tmp =
+            !__Bounded ? __last1
+                       : __first1 + std::min<_DifferenceType>(__n1, __n_out > __n2_tmp ? __n_out - __n2_tmp : 0);
+        const _DifferenceType1 __n1_tmp = __last1_tmp - __first1; // seq1 elements copied right after them
+
         //{2} < {1}: seq2 is wholly greater than seq1, so, do parallel copying seq1 and seq2
         __par_backend::__parallel_invoke(
             __backend_tag{}, __exec,
             [=, &__exec] {
-                __internal::__pattern_walk2_brick(__tag, __exec, __first2, __last2, __result, __copy_range);
+                __internal::__pattern_walk2_brick(__tag, __exec, __first2, __last2_tmp, __result1, __copy_range);
             },
             [=, &__exec] {
-                __internal::__pattern_walk2_brick(__tag, __exec, __first1, __last1, __result + __n2, __copy_range);
+                __internal::__pattern_walk2_brick(__tag, __exec, __first1, __last1_tmp, __result1 + __n2_tmp,
+                                                  __copy_range);
             });
-        return __result + __n1 + __n2;
+
+        return {__last1_tmp, __last2_tmp, __result1 + __n1_tmp + __n2_tmp};
     }
 
-    const auto __m1 = __left_bound_seq_1 - __first1;
-    if (__m1 > __set_algo_cut_off)
+    auto __size_fnc = [](_DifferenceType __n, _DifferenceType __m) { return __n + __m; };
+    auto __mask_size_fnc = __size_fnc; // the same n + m rule is passed as the mask-size function
+
+    const _DifferenceType1 __m1 = __left_bound_seq_1 - __first1;
+    if (oneapi::dpl::__internal::__is_set_algo_cutoff_exceeded(__m1))
     {
-        auto __res_or = __result;
-        __result += __m1; //we know proper offset due to [first1; left_bound_seq_1) < [first2; last2)
+        oneapi::dpl::__utils::__set_operations_result<_RandomAccessIterator1, _RandomAccessIterator2, _OutputIterator>
+            __finish;
+
+        const _DifferenceType __to_copy = __Bounded ? std::min<_DifferenceType>(__m1, __n_out) : __m1;
+
         __par_backend::__parallel_invoke(
             __backend_tag{}, __exec,
             //do parallel copying of [first1; left_bound_seq_1)
             [=, &__exec] {
-                __internal::__pattern_walk2_brick(__tag, __exec, __first1, __left_bound_seq_1, __res_or, __copy_range);
+                __internal::__pattern_walk2_brick(__tag, __exec, __first1, __first1 + __to_copy, __result1,
+                                                  __copy_range);
             },
-            [=, &__exec, &__result] {
-                __result = __internal::__parallel_set_op(
-                    __tag, __exec, __left_bound_seq_1, __last1, __first2, __last2, __result,
-                    [](_DifferenceType __n, _DifferenceType __m) { return __n + __m; }, __set_union_op, __comp, __proj1,
-                    __proj2);
+            [=, &__exec, &__finish] {
+                __finish = __internal::__parallel_set_op<__Bounded>(
+                    __tag, __exec, __left_bound_seq_1, __last1, __first2, __last2, __result1 + __to_copy, __result2,
+                    __comp, __proj1, __proj2, __size_fnc, __mask_size_fnc, __set_union_op);
             });
-        return __result;
+
+        if constexpr (__Bounded)
+            if (__to_copy < __m1)
+                __finish.__in1 = __first1 + __to_copy; // prefix copy was truncated: report where seq1 stopped
+
+        return __finish;
     }
 
-    const auto __m2 = __left_bound_seq_2 - __first2;
+    const _DifferenceType2 __m2 = __left_bound_seq_2 - __first2;
     assert(__m1 == 0 || __m2 == 0);
-    if (__m2 > __set_algo_cut_off)
+    if (oneapi::dpl::__internal::__is_set_algo_cutoff_exceeded(__m2))
     {
-        auto __res_or = __result;
-        __result += __m2; //we know proper offset due to [first2; left_bound_seq_2) < [first1; last1)
+        oneapi::dpl::__utils::__set_operations_result<_RandomAccessIterator1, _RandomAccessIterator2, _OutputIterator>
+            __finish;
+
+        const _DifferenceType __to_copy = __Bounded ? std::min<_DifferenceType>(__m2, __n_out) : __m2;
+
         __par_backend::__parallel_invoke(
             __backend_tag{}, __exec,
             //do parallel copying of [first2; left_bound_seq_2)
             [=, &__exec] {
-                __internal::__pattern_walk2_brick(__tag, __exec, __first2, __left_bound_seq_2, __res_or, __copy_range);
+                __internal::__pattern_walk2_brick(__tag, __exec, __first2, __first2 + __to_copy, __result1,
+                                                  __copy_range);
             },
-            [=, &__exec, &__result] {
-                __result = __internal::__parallel_set_op(
-                    __tag, __exec, __first1, __last1, __left_bound_seq_2, __last2, __result,
-                    [](_DifferenceType __n, _DifferenceType __m) { return __n + __m; }, __set_union_op, __comp, __proj1,
-                    __proj2);
+            [=, &__exec, &__finish] {
+                __finish = __internal::__parallel_set_op<__Bounded>(
+                    __tag, __exec, __first1, __last1, __left_bound_seq_2, __last2, __result1 + __to_copy, __result2,
+                    __comp, __proj1, __proj2, __size_fnc, __mask_size_fnc, __set_union_op);
             });
-        return __result;
+
+        if constexpr (__Bounded)
+            if (__to_copy < __m2)
+                __finish.__in2 = __first2 + __to_copy; // prefix copy was truncated: report where seq2 stopped
+
+        return __finish;
     }
 
-    return __internal::__parallel_set_op(
-        __tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __last2, __result,
-        [](_DifferenceType __n, _DifferenceType __m) { return __n + __m; }, __set_union_op, __comp, __proj1, __proj2);
+    return __internal::__parallel_set_op<__Bounded>(__tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1,
+                                                    __first2, __last2, __result1, __result2, __comp, __proj1, __proj2,
+                                                    __size_fnc, __mask_size_fnc, __set_union_op);
 }
 
 //------------------------------------------------------------------------
@@ -3550,24 +4283,24 @@ __pattern_set_union(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __exec,
                     _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, _RandomAccessIterator2 __last2,
                     _OutputIterator __result, _Compare __comp)
 {
-    const auto __n1 = __last1 - __first1;
-    const auto __n2 = __last2 - __first2;
+    using _DifferenceType1 = typename std::iterator_traits<_RandomAccessIterator1>::difference_type;
+    using _DifferenceType2 = typename std::iterator_traits<_RandomAccessIterator2>::difference_type;
+
+    const _DifferenceType1 __n1 = __last1 - __first1;
+    const _DifferenceType2 __n2 = __last2 - __first2;
 
     // use serial algorithm
-    if (__n1 + __n2 <= __set_algo_cut_off)
+    if (!oneapi::dpl::__internal::__is_set_algo_cutoff_exceeded(__n1 + __n2))
         return std::set_union(__first1, __last1, __first2, __last2, __result, __comp);
 
-    using _Tp = typename std::iterator_traits<_OutputIterator>::value_type;
-    return __parallel_set_union_op(
-        __tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __last2, __result,
-        [](_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2,
-           _RandomAccessIterator2 __last2, _Tp* __result, _Compare __comp, oneapi::dpl::identity,
-           oneapi::dpl::identity) {
-            return oneapi::dpl::__utils::__set_union_construct(__first1, __last1, __first2, __last2, __result,
-                                                               __BrickCopyConstruct<_IsVector>(), __comp,
-                                                               oneapi::dpl::identity{}, oneapi::dpl::identity{});
-        },
-        __comp, oneapi::dpl::identity{}, oneapi::dpl::identity{});
+    return __parallel_set_union_op</*__Bounded*/ false>(
+               __tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __last2, __result,
+               __result + __n1 + __n2, __comp, oneapi::dpl::identity{}, oneapi::dpl::identity{},
+               [](auto&&... __args) {
+                   return oneapi::dpl::__utils::__set_union_construct<__BrickCopyConstruct<_IsVector>>(
+                       std::forward<decltype(__args)>(__args)...);
+               })
+        .__get_reached_out();
 }
 
 //------------------------------------------------------------------------
@@ -3614,14 +4347,12 @@ __pattern_set_intersection(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& _
                            _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2,
                            _RandomAccessIterator2 __last2, _RandomAccessIterator3 __result, _Compare __comp)
 {
-    using _T = typename std::iterator_traits<_RandomAccessIterator3>::value_type;
-
     using _DifferenceType1 = typename std::iterator_traits<_RandomAccessIterator1>::difference_type;
     using _DifferenceType2 = typename std::iterator_traits<_RandomAccessIterator2>::difference_type;
     using _DifferenceType = std::common_type_t<_DifferenceType1, _DifferenceType2>;
 
-    const auto __n1 = __last1 - __first1;
-    const auto __n2 = __last2 - __first2;
+    const _DifferenceType1 __n1 = __last1 - __first1;
+    const _DifferenceType2 __n2 = __last2 - __first2;
 
     // intersection is empty
     if (__n1 == 0 || __n2 == 0)
@@ -3639,44 +4370,41 @@ __pattern_set_intersection(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& _
     if (__left_bound_seq_2 == __last2)
         return __result;
 
-    const auto __m1 = __last1 - __left_bound_seq_1 + __n2;
-    if (__m1 > __set_algo_cut_off)
+    const _DifferenceType __m1 = __last1 - __left_bound_seq_1 + __n2;
+    if (oneapi::dpl::__internal::__is_set_algo_cutoff_exceeded(__m1))
     {
         //we know proper offset due to [first1; left_bound_seq_1) < [first2; last2)
         return __internal::__except_handler([&]() {
-            return __internal::__parallel_set_op(
-                __tag, std::forward<_ExecutionPolicy>(__exec), __left_bound_seq_1, __last1, __first2, __last2, __result,
-                [](_DifferenceType __n, _DifferenceType __m) { return std::min(__n, __m); },
-                [](_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2,
-                   _RandomAccessIterator2 __last2, _T* __result, _Compare __comp, oneapi::dpl::identity,
-                   oneapi::dpl::identity) {
-                    return oneapi::dpl::__utils::__set_intersection_construct(
-                        __first1, __last1, __first2, __last2, __result,
-                        oneapi::dpl::__internal::__op_uninitialized_copy<_ExecutionPolicy>{}, __comp,
-                        oneapi::dpl::identity{}, oneapi::dpl::identity{});
-                },
-                __comp, oneapi::dpl::identity{}, oneapi::dpl::identity{});
+            return __internal::__parallel_set_op</*__Bounded*/ false>(
+                       __tag, std::forward<_ExecutionPolicy>(__exec), __left_bound_seq_1, __last1, __first2, __last2,
+                       __result, __result + __n1 + __n2, __comp, oneapi::dpl::identity{}, oneapi::dpl::identity{},
+                       [](_DifferenceType __n, _DifferenceType __m) { return std::min(__n, __m); },
+                       [](_DifferenceType __n, _DifferenceType __m) { return __n + __m; },
+                       [](auto&&... __args) {
+                           return oneapi::dpl::__utils::__set_intersection_construct<
+                               oneapi::dpl::__internal::__op_uninitialized_copy<_ExecutionPolicy>>(
+                               std::forward<decltype(__args)>(__args)...);
+                       })
+                .__get_reached_out();
         });
     }
 
-    const auto __m2 = __last2 - __left_bound_seq_2 + __n1;
-    if (__m2 > __set_algo_cut_off)
+    const _DifferenceType __m2 = __last2 - __left_bound_seq_2 + __n1;
+    if (oneapi::dpl::__internal::__is_set_algo_cutoff_exceeded(__m2))
     {
         //we know proper offset due to [first2; left_bound_seq_2) < [first1; last1)
         return __internal::__except_handler([&]() {
-            __result = __internal::__parallel_set_op(
-                __tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __left_bound_seq_2, __last2, __result,
-                [](_DifferenceType __n, _DifferenceType __m) { return std::min(__n, __m); },
-                [](_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2,
-                   _RandomAccessIterator2 __last2, _T* __result, _Compare __comp, oneapi::dpl::identity,
-                   oneapi::dpl::identity) {
-                    return oneapi::dpl::__utils::__set_intersection_construct(
-                        __first1, __last1, __first2, __last2, __result,
-                        oneapi::dpl::__internal::__op_uninitialized_copy<_ExecutionPolicy>{}, __comp,
-                        oneapi::dpl::identity{}, oneapi::dpl::identity{});
-                },
-                __comp, oneapi::dpl::identity{}, oneapi::dpl::identity{});
-            return __result;
+            return __internal::__parallel_set_op</*__Bounded*/ false>(
+                       __tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __left_bound_seq_2, __last2,
+                       __result, __result + __n1 + __n2, __comp, oneapi::dpl::identity{}, oneapi::dpl::identity{},
+                       [](_DifferenceType __n, _DifferenceType __m) { return std::min(__n, __m); },
+                       [](_DifferenceType __n, _DifferenceType __m) { return __n + __m; },
+                       [](auto&&... __args) {
+                           return oneapi::dpl::__utils::__set_intersection_construct<
+                               oneapi::dpl::__internal::__op_uninitialized_copy<_ExecutionPolicy>>(
+                               std::forward<decltype(__args)>(__args)...);
+                       })
+                .__get_reached_out();
         });
     }
 
@@ -3727,11 +4455,12 @@ __pattern_set_difference(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __e
                          _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2,
                          _RandomAccessIterator2 __last2, _RandomAccessIterator3 __result, _Compare __comp)
 {
-    using _T = typename std::iterator_traits<_RandomAccessIterator3>::value_type;
-    using _DifferenceType = typename std::iterator_traits<_RandomAccessIterator1>::difference_type;
+    using _DifferenceType1 = typename std::iterator_traits<_RandomAccessIterator1>::difference_type;
+    using _DifferenceType2 = typename std::iterator_traits<_RandomAccessIterator2>::difference_type;
+    using _DifferenceType = std::common_type_t<_DifferenceType1, _DifferenceType2>;
 
-    const auto __n1 = __last1 - __first1;
-    const auto __n2 = __last2 - __first2;
+    const _DifferenceType1 __n1 = __last1 - __first1;
+    const _DifferenceType2 __n2 = __last2 - __first2;
 
     // {} \ {2}: the difference is empty
     if (__n1 == 0)
@@ -3756,18 +4485,19 @@ __pattern_set_difference(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __e
         return __internal::__pattern_walk2_brick(__tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1,
                                                  __result, __brick_copy<__parallel_tag<_IsVector>>{});
 
-    if (__n1 + __n2 > __set_algo_cut_off)
-        return __parallel_set_op(
-            __tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __last2, __result,
-            [](_DifferenceType __n, _DifferenceType) { return __n; },
-            [](_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2,
-               _RandomAccessIterator2 __last2, _T* __result, _Compare __comp, oneapi::dpl::identity,
-               oneapi::dpl::identity) {
-                return oneapi::dpl::__utils::__set_difference_construct(
-                    __first1, __last1, __first2, __last2, __result, __BrickCopyConstruct<_IsVector>(), __comp,
-                    oneapi::dpl::identity{}, oneapi::dpl::identity{});
-            },
-            __comp, oneapi::dpl::identity{}, oneapi::dpl::identity{});
+    if (oneapi::dpl::__internal::__is_set_algo_cutoff_exceeded(__n1 + __n2))
+    {
+        return __parallel_set_op</*__Bounded*/ false>(
+                   __tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __last2, __result,
+                   __result + __n1 + __n2, __comp, oneapi::dpl::identity{}, oneapi::dpl::identity{},
+                   [](_DifferenceType __n, _DifferenceType) { return __n; },
+                   [](_DifferenceType __n, _DifferenceType __m) { return __n + __m; },
+                   [](auto&&... __args) {
+                       return oneapi::dpl::__utils::__set_difference_construct<__BrickCopyConstruct<_IsVector>>(
+                           std::forward<decltype(__args)>(__args)...);
+                   })
+            .__get_reached_out();
+    }
 
     // use serial algorithm
     return std::set_difference(__first1, __last1, __first2, __last2, __result, __comp);
@@ -3818,25 +4548,25 @@ __pattern_set_symmetric_difference(__parallel_tag<_IsVector> __tag, _ExecutionPo
                                    _RandomAccessIterator2 __first2, _RandomAccessIterator2 __last2,
                                    _RandomAccessIterator3 __result, _Compare __comp)
 {
-    const auto __n1 = __last1 - __first1;
-    const auto __n2 = __last2 - __first2;
+    using _DifferenceType1 = typename std::iterator_traits<_RandomAccessIterator1>::difference_type;
+    using _DifferenceType2 = typename std::iterator_traits<_RandomAccessIterator2>::difference_type;
+
+    const _DifferenceType1 __n1 = __last1 - __first1;
+    const _DifferenceType2 __n2 = __last2 - __first2;
 
     // use serial algorithm
-    if (__n1 + __n2 <= __set_algo_cut_off)
+    if (!oneapi::dpl::__internal::__is_set_algo_cutoff_exceeded(__n1 + __n2))
         return std::set_symmetric_difference(__first1, __last1, __first2, __last2, __result, __comp);
 
-    using _T = typename std::iterator_traits<_RandomAccessIterator3>::value_type;
     return __internal::__except_handler([&]() {
-        return __internal::__parallel_set_union_op(
-            __tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __last2, __result,
-            [](_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2,
-               _RandomAccessIterator2 __last2, _T* __result, _Compare __comp, oneapi::dpl::identity,
-               oneapi::dpl::identity) {
-                return oneapi::dpl::__utils::__set_symmetric_difference_construct(
-                    __first1, __last1, __first2, __last2, __result, __BrickCopyConstruct<_IsVector>(), __comp,
-                    oneapi::dpl::identity{}, oneapi::dpl::identity{});
-            },
-            __comp, oneapi::dpl::identity{}, oneapi::dpl::identity{});
+        return __internal::__parallel_set_union_op</*__Bounded*/ false>(
+                   __tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __last2, __result,
+                   __result + __n1 + __n2, __comp, oneapi::dpl::identity{}, oneapi::dpl::identity{},
+                   [](auto&&... __args) {
+                       return oneapi::dpl::__utils::__set_symmetric_difference_construct<
+                           __BrickCopyConstruct<_IsVector>>(std::forward<decltype(__args)>(__args)...);
+                   })
+            .__get_reached_out();
     });
 }
 
diff --git a/include/oneapi/dpl/pstl/algorithm_ranges_impl.h b/include/oneapi/dpl/pstl/algorithm_ranges_impl.h
index 6d7fa7df3bd..f914bc3ce6a 100644
--- a/include/oneapi/dpl/pstl/algorithm_ranges_impl.h
+++ b/include/oneapi/dpl/pstl/algorithm_ranges_impl.h
@@ -727,7 +727,7 @@ __pattern_includes(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __exec, _
     const auto __n2 = std::ranges::size(__r2);
 
     // use serial algorithm
-    if (__n1 + __n2 <= oneapi::dpl::__internal::__set_algo_cut_off)
+    if (!oneapi::dpl::__internal::__is_set_algo_cutoff_exceeded(__n1 + __n2))
         return std::ranges::includes(std::forward<_R1>(__r1), std::forward<_R2>(__r2), __comp, __proj1, __proj2);
 
     auto __first1 = std::ranges::begin(__r1);
@@ -805,13 +805,60 @@ using __set_union_return_t =
     std::ranges::set_union_result<std::ranges::borrowed_iterator_t<_R1>, std::ranges::borrowed_iterator_t<_R2>,
                                   std::ranges::borrowed_iterator_t<_OutRange>>;
 
+// Bounded serial set_union: computes the union of __r1 and __r2 into __out_r, never writing past the end of __out_r.
+// If __out_r is too small the result is truncated; the returned iterators are the positions reached in each range.
+template <std::ranges::random_access_range _R1, std::ranges::random_access_range _R2,
+          std::ranges::random_access_range _OutRange, typename _Comp, typename _Proj1, typename _Proj2>
+__set_union_return_t<_R1, _R2, _OutRange>
+__serial_set_union(_R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2)
+{
+    using _DifferenceType = oneapi::dpl::__ranges::__common_size_t<decltype(__r1), decltype(__r2), decltype(__out_r)>;
+
+    auto [__it1, __end1] = oneapi::dpl::__ranges::__get_range_bounds(__r1);
+    auto [__it2, __end2] = oneapi::dpl::__ranges::__get_range_bounds(__r2);
+    auto [__out_it, __out_end] = oneapi::dpl::__ranges::__get_range_bounds(__out_r);
+
+    // 1. Main set_union operation: runs while both inputs have elements and the output has room
+    while (__it1 != __end1 && __it2 != __end2 && __out_it != __out_end)
+    {
+        if (std::invoke(__comp, std::invoke(__proj1, *__it1), std::invoke(__proj2, *__it2)))
+        {
+            *__out_it = *__it1;
+            ++__it1;
+        }
+        else if (std::invoke(__comp, std::invoke(__proj2, *__it2), std::invoke(__proj1, *__it1)))
+        {
+            *__out_it = *__it2;
+            ++__it2;
+        }
+        else // equivalent elements: emit the one from the first range and consume both
+        {
+            *__out_it = *__it1;
+            ++__it1;
+            ++__it2;
+        }
+        ++__out_it;
+    }
+
+    // 2. Copy the leftovers of the first and then the second input, each clipped to the remaining output capacity
+    const _DifferenceType __remaining_capacity1 = __out_end - __out_it;
+    const _DifferenceType __copy_n1 = __end1 - __it1;
+    auto __copy1 = std::ranges::copy(__it1, __it1 + std::min(__copy_n1, __remaining_capacity1), __out_it);
+
+    const _DifferenceType __remaining_capacity2 = __out_end - __copy1.out;
+    const _DifferenceType __copy_n2 = __end2 - __it2;
+    auto __copy2 = std::ranges::copy(__it2, __it2 + std::min(__copy_n2, __remaining_capacity2), __copy1.out);
+
+    return {__copy1.in, __copy2.in, __copy2.out};
+}
+
 template <typename _R1, typename _R2, typename _OutRange, typename _Comp, typename _Proj1, typename _Proj2>
 __set_union_return_t<_R1, _R2, _OutRange>
 __brick_set_union(_R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2,
                   /*__is_vector=*/std::false_type) noexcept
 {
-    return std::ranges::set_union(std::forward<_R1>(__r1), std::forward<_R2>(__r2), std::ranges::begin(__out_r), __comp,
-                                  __proj1, __proj2);
+    return __serial_set_union(std::forward<_R1>(__r1), std::forward<_R2>(__r2), std::forward<_OutRange>(__out_r),
+                              __comp, __proj1, __proj2);
 }
 
 template <typename _R1, typename _R2, typename _OutRange, typename _Comp, typename _Proj1, typename _Proj2>
@@ -820,8 +867,8 @@ __brick_set_union(_R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp, _Pr
                   /*__is_vector=*/std::true_type) noexcept
 {
     _PSTL_PRAGMA_MESSAGE("Vectorized algorithm unimplemented, redirected to serial");
-    return std::ranges::set_union(std::forward<_R1>(__r1), std::forward<_R2>(__r2), std::ranges::begin(__out_r), __comp,
-                                  __proj1, __proj2);
+    return __serial_set_union(std::forward<_R1>(__r1), std::forward<_R2>(__r2), std::forward<_OutRange>(__out_r),
+                              __comp, __proj1, __proj2);
 }
 
 template <typename _Tag, typename _ExecutionPolicy, typename _R1, typename _R2, typename _OutRange, typename _Comp,
@@ -830,7 +877,7 @@ __set_union_return_t<_R1, _R2, _OutRange>
 __pattern_set_union(_Tag, _ExecutionPolicy&&, _R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp, _Proj1 __proj1,
                     _Proj2 __proj2)
 {
-    static_assert(__is_serial_tag_v<_Tag>);
+    static_assert(__is_serial_tag_v<_Tag> || __is_parallel_forward_tag_v<_Tag>);
 
     return __brick_set_union(std::forward<_R1>(__r1), std::forward<_R2>(__r2), std::forward<_OutRange>(__out_r), __comp,
                              __proj1, __proj2, typename _Tag::__is_vector{});
@@ -842,34 +889,23 @@ __set_union_return_t<_R1, _R2, _OutRange>
 __pattern_set_union(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __exec, _R1&& __r1, _R2&& __r2,
                     _OutRange&& __out_r, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2)
 {
-    using _RandomAccessIterator1 = std::ranges::iterator_t<_R1>;
-    using _RandomAccessIterator2 = std::ranges::iterator_t<_R2>;
-    using _Tp = std::ranges::range_value_t<_OutRange>;
-
-    const auto __n1 = std::ranges::size(__r1);
-    const auto __n2 = std::ranges::size(__r2);
+    auto [__first1, __last1, __n1] = oneapi::dpl::__ranges::__get_range_bounds_n(__r1);
+    auto [__first2, __last2, __n2] = oneapi::dpl::__ranges::__get_range_bounds_n(__r2);
+    auto [__result1, __result2] = oneapi::dpl::__ranges::__get_range_bounds(__out_r);
 
     // use serial algorithm
-    if (__n1 + __n2 <= oneapi::dpl::__internal::__set_algo_cut_off)
-        return std::ranges::set_union(__r1, __r2, std::begin(__out_r), __comp, __proj1, __proj2);
-
-    auto __first1 = std::ranges::begin(__r1);
-    auto __last1 = __first1 + __n1;
-    auto __first2 = std::ranges::begin(__r2);
-    auto __last2 = __first2 + __n2;
-    auto __result = std::ranges::begin(__out_r);
+    if (!oneapi::dpl::__internal::__is_set_algo_cutoff_exceeded(__n1 + __n2))
+        return __serial_set_union(std::forward<_R1>(__r1), std::forward<_R2>(__r2), std::forward<_OutRange>(__out_r),
+                                  __comp, __proj1, __proj2);
 
-    auto __out_last = oneapi::dpl::__internal::__parallel_set_union_op(
-        __tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __last2, __result,
-        [](_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2,
-           _RandomAccessIterator2 __last2, _Tp* __result, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2) {
-            return oneapi::dpl::__utils::__set_union_construct(
-                __first1, __last1, __first2, __last2, __result,
-                oneapi::dpl::__internal::__BrickCopyConstruct<_IsVector>(), __comp, __proj1, __proj2);
-        },
-        __comp, __proj1, __proj2);
-
-    return {__first1 + __n1, __first2 + __n2, __result + (__out_last - __result)};
+    return oneapi::dpl::__internal::__parallel_set_union_op</*__Bounded*/ true>(
+               __tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __last2, __result1,
+               __result2, __comp, __proj1, __proj2,
+               [](auto&&... __args) {
+                   return oneapi::dpl::__utils::__set_union_construct<__BrickCopyConstruct<_IsVector>>(
+                       std::forward<decltype(__args)>(__args)...);
+               })
+        .template __get_reached_in1_in2_out<__set_union_return_t<_R1, _R2, _OutRange>>();
 }
 
 //---------------------------------------------------------------------------------------------------------------------
@@ -881,13 +917,49 @@ using __set_intersection_return_t =
     std::ranges::set_intersection_result<std::ranges::borrowed_iterator_t<_R1>, std::ranges::borrowed_iterator_t<_R2>,
                                          std::ranges::borrowed_iterator_t<_OutRange>>;
 
+// Serial set_intersection writing into a bounded output range.
+// Stops as soon as a common element is found but __out_r has no room left, so the result may be truncated.
+template <typename _R1, typename _R2, typename _OutRange, typename _Comp, typename _Proj1, typename _Proj2>
+__set_intersection_return_t<_R1, _R2, _OutRange>
+__serial_set_intersection(_R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2)
+{
+    auto [__cur1, __last1] = oneapi::dpl::__ranges::__get_range_bounds(__r1);
+    auto [__cur2, __last2] = oneapi::dpl::__ranges::__get_range_bounds(__r2);
+    auto [__dst, __dst_last] = oneapi::dpl::__ranges::__get_range_bounds(__out_r);
+
+    while (__cur1 != __last1 && __cur2 != __last2)
+    {
+        if (std::invoke(__comp, std::invoke(__proj1, *__cur1), std::invoke(__proj2, *__cur2)))
+        {
+            // current element of the first range orders before the one of the second range: skip it
+            ++__cur1;
+            continue;
+        }
+        if (std::invoke(__comp, std::invoke(__proj2, *__cur2), std::invoke(__proj1, *__cur1)))
+        {
+            // current element of the second range orders before the one of the first range: skip it
+            ++__cur2;
+            continue;
+        }
+        // equivalent elements: emit the one from the first range, unless the output is already full
+        if (__dst == __dst_last)
+            break;
+        *__dst = *__cur1;
+        ++__dst;
+        ++__cur1;
+        ++__cur2;
+    }
+
+    return {__cur1, __cur2, __dst};
+}
+
 template <typename _R1, typename _R2, typename _OutRange, typename _Comp, typename _Proj1, typename _Proj2>
 __set_intersection_return_t<_R1, _R2, _OutRange>
 __brick_set_intersection(_R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2,
                          /*__is_vector=*/std::false_type) noexcept
 {
-    return std::ranges::set_intersection(std::forward<_R1>(__r1), std::forward<_R2>(__r2), std::ranges::begin(__out_r),
-                                         __comp, __proj1, __proj2);
+    return __serial_set_intersection(std::forward<_R1>(__r1), std::forward<_R2>(__r2), std::forward<_OutRange>(__out_r),
+                                     __comp, __proj1, __proj2);
 }
 
 template <typename _R1, typename _R2, typename _OutRange, typename _Comp, typename _Proj1, typename _Proj2>
@@ -896,8 +968,8 @@ __brick_set_intersection(_R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __co
                          /*__is_vector=*/std::true_type) noexcept
 {
     _PSTL_PRAGMA_MESSAGE("Vectorized algorithm unimplemented, redirected to serial");
-    return std::ranges::set_intersection(std::forward<_R1>(__r1), std::forward<_R2>(__r2), std::ranges::begin(__out_r),
-                                         __comp, __proj1, __proj2);
+    return __serial_set_intersection(std::forward<_R1>(__r1), std::forward<_R2>(__r2), std::forward<_OutRange>(__out_r),
+                                     __comp, __proj1, __proj2);
 }
 
 template <typename _Tag, typename _ExecutionPolicy, typename _R1, typename _R2, typename _OutRange, typename _Comp,
@@ -906,7 +978,7 @@ __set_intersection_return_t<_R1, _R2, _OutRange>
 __pattern_set_intersection(_Tag, _ExecutionPolicy&&, _R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp,
                            _Proj1 __proj1, _Proj2 __proj2)
 {
-    static_assert(__is_serial_tag_v<_Tag>);
+    static_assert(__is_serial_tag_v<_Tag> || __is_parallel_forward_tag_v<_Tag>);
 
     return __brick_set_intersection(std::forward<_R1>(__r1), std::forward<_R2>(__r2), std::forward<_OutRange>(__out_r),
                                     __comp, __proj1, __proj2, typename _Tag::__is_vector{});
@@ -920,24 +992,18 @@ __pattern_set_intersection(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& _
 {
     using _RandomAccessIterator1 = std::ranges::iterator_t<_R1>;
     using _RandomAccessIterator2 = std::ranges::iterator_t<_R2>;
-    using _T = std::ranges::range_value_t<_OutRange>;
 
     using _DifferenceType1 = typename std::iterator_traits<_RandomAccessIterator1>::difference_type;
     using _DifferenceType2 = typename std::iterator_traits<_RandomAccessIterator2>::difference_type;
     using _DifferenceType = std::common_type_t<_DifferenceType1, _DifferenceType2>;
 
-    const auto __n1 = std::ranges::size(__r1);
-    const auto __n2 = std::ranges::size(__r2);
-
-    auto __first1 = std::ranges::begin(__r1);
-    auto __last1 = __first1 + __n1;
-    auto __first2 = std::ranges::begin(__r2);
-    auto __last2 = __first2 + __n2;
-    auto __result = std::ranges::begin(__out_r);
+    auto [__first1, __last1, __n1] = oneapi::dpl::__ranges::__get_range_bounds_n(__r1);
+    auto [__first2, __last2, __n2] = oneapi::dpl::__ranges::__get_range_bounds_n(__r2);
+    auto [__result1, __result2] = oneapi::dpl::__ranges::__get_range_bounds(__out_r);
 
     // intersection is empty
     if (__n1 == 0 || __n2 == 0)
-        return {__last1, __last2, __result};
+        return {__first1, __first2, __result1};
 
     // testing  whether the sequences are intersected
     auto __left_bound_seq_1 =
@@ -945,7 +1011,7 @@ __pattern_set_intersection(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& _
                                                                __first2, __comp, __proj1, __proj2);
     //{1} < {2}: seq 2 is wholly greater than seq 1, so, the intersection is empty
     if (__left_bound_seq_1 == __last1)
-        return {__last1, __last2, __result};
+        return {__last1, __first2, __result1};
 
     // testing  whether the sequences are intersected
     auto __left_bound_seq_2 =
@@ -953,85 +1019,151 @@ __pattern_set_intersection(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& _
                                                                __first1, __comp, __proj2, __proj1);
     //{2} < {1}: seq 1 is wholly greater than seq 2, so, the intersection is empty
     if (__left_bound_seq_2 == __last2)
-        return {__last1, __last2, __result};
+        return {__first1, __last2, __result1};
 
     const auto __m1 = __last1 - __left_bound_seq_1 + __n2;
-    if (__m1 > oneapi::dpl::__internal::__set_algo_cut_off)
+    if (oneapi::dpl::__internal::__is_set_algo_cutoff_exceeded(__m1))
     {
         //we know proper offset due to [first1; left_bound_seq_1) < [first2; last2)
         return __internal::__except_handler([&]() {
-            auto __out_last = __internal::__parallel_set_op(
-                __tag, std::forward<_ExecutionPolicy>(__exec), __left_bound_seq_1, __last1, __first2, __last2, __result,
-                [](_DifferenceType __n, _DifferenceType __m) { return std::min(__n, __m); },
-                [](_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2,
-                   _RandomAccessIterator2 __last2, _T* __result, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2) {
-                    return oneapi::dpl::__utils::__set_intersection_construct(
-                        __first1, __last1, __first2, __last2, __result,
-                        oneapi::dpl::__internal::__op_uninitialized_copy<_ExecutionPolicy>{}, __comp, __proj1, __proj2);
-                },
-                __comp, __proj1, __proj2);
-            return __set_intersection_return_t<_R1, _R2, _OutRange>{__last1, __last2, __out_last};
+            return __internal::__parallel_set_op</*__Bounded*/ true>(
+                       __tag, std::forward<_ExecutionPolicy>(__exec), __left_bound_seq_1, __last1, __first2, __last2,
+                       __result1, __result2, __comp, __proj1, __proj2,
+                       [](_DifferenceType __n, _DifferenceType __m) { return std::min(__n, __m); },
+                       [](_DifferenceType __n, _DifferenceType __m) { return __n + __m; },
+                       [](auto&&... __args) {
+                           return oneapi::dpl::__utils::__set_intersection_construct<
+                               oneapi::dpl::__internal::__op_uninitialized_copy<_ExecutionPolicy>>(
+                               std::forward<decltype(__args)>(__args)...);
+                       })
+                .template __get_reached_in1_in2_out<__set_intersection_return_t<_R1, _R2, _OutRange>>();
         });
     }
 
     const auto __m2 = __last2 - __left_bound_seq_2 + __n1;
-    if (__m2 > oneapi::dpl::__internal::__set_algo_cut_off)
+    if (oneapi::dpl::__internal::__is_set_algo_cutoff_exceeded(__m2))
     {
         //we know proper offset due to [first2; left_bound_seq_2) < [first1; last1)
         return __internal::__except_handler([&]() {
-            auto __out_last = __internal::__parallel_set_op(
-                __tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __left_bound_seq_2, __last2, __result,
-                [](_DifferenceType __n, _DifferenceType __m) { return std::min(__n, __m); },
-                [](_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2,
-                   _RandomAccessIterator2 __last2, _T* __result, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2) {
-                    return oneapi::dpl::__utils::__set_intersection_construct(
-                        __first1, __last1, __first2, __last2, __result,
-                        oneapi::dpl::__internal::__op_uninitialized_copy<_ExecutionPolicy>{}, __comp, __proj1, __proj2);
-                },
-                __comp, __proj1, __proj2);
-            return __set_intersection_return_t<_R1, _R2, _OutRange>{__last1, __last2, __out_last};
+            return __internal::__parallel_set_op</*__Bounded*/ true>(
+                       __tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __left_bound_seq_2, __last2,
+                       __result1, __result2, __comp, __proj1, __proj2,
+                       [](_DifferenceType __n, _DifferenceType __m) { return std::min(__n, __m); },
+                       [](_DifferenceType __n, _DifferenceType __m) { return __n + __m; },
+                       [](auto&&... __args) {
+                           return oneapi::dpl::__utils::__set_intersection_construct<
+                               oneapi::dpl::__internal::__op_uninitialized_copy<_ExecutionPolicy>>(
+                               std::forward<decltype(__args)>(__args)...);
+                       })
+                .template __get_reached_in1_in2_out<__set_intersection_return_t<_R1, _R2, _OutRange>>();
         });
     }
 
     // [left_bound_seq_1; last1) and [left_bound_seq_2; last2) - use serial algorithm
-    return std::ranges::set_intersection(__left_bound_seq_1, __last1, __left_bound_seq_2, __last2,
-                                         std::ranges::begin(__out_r), __comp, __proj1, __proj2);
+    return __serial_set_intersection(std::ranges::subrange(__left_bound_seq_1, __last1),
+                                     std::ranges::subrange(__left_bound_seq_2, __last2), __out_r, __comp, __proj1,
+                                     __proj2);
 }
 
 //---------------------------------------------------------------------------------------------------------------------
 // set_difference
 //---------------------------------------------------------------------------------------------------------------------
 
-template <typename _R1, typename _OutRange>
-using __set_difference_return_t = std::ranges::set_difference_result<std::ranges::borrowed_iterator_t<_R1>,
-                                                                     std::ranges::borrowed_iterator_t<_OutRange>>;
+#if ONEDPL_RANGES_SET_DIFFERENCE_CPP23_RESULT // C++23 compatibility mode: the result omits the second input iterator
+template <typename _R1, typename _R2, typename _OutRange>
+using __set_difference_return_t =
+    std::ranges::in_out_result<std::ranges::borrowed_iterator_t<_R1>, std::ranges::borrowed_iterator_t<_OutRange>>;
+#else // default mode: the result also carries an iterator into the second input range
+template <typename _R1, typename _R2, typename _OutRange>
+using __set_difference_return_t =
+    std::ranges::in_in_out_result<std::ranges::borrowed_iterator_t<_R1>, std::ranges::borrowed_iterator_t<_R2>,
+                                  std::ranges::borrowed_iterator_t<_OutRange>>;
+#endif // ONEDPL_RANGES_SET_DIFFERENCE_CPP23_RESULT
+
+// Builds the oneapi::dpl::ranges::set_difference result for the selected compatibility mode: in C++23 mode the
+// second input iterator is dropped (in_out_result), otherwise all three iterators are kept (in_in_out_result).
+template <typename _R1, typename _R2, typename _OutRange, typename _It1, typename _It2, typename _ItOut>
+__set_difference_return_t<_R1, _R2, _OutRange>
+__create_set_difference_result(_It1 __it1, [[maybe_unused]] _It2 __it2, _ItOut __it_out)
+{
+#if ONEDPL_RANGES_SET_DIFFERENCE_CPP23_RESULT
+    return std::ranges::in_out_result<_It1, _ItOut>{__it1, __it_out};
+#else
+    return std::ranges::in_in_out_result<_It1, _It2, _ItOut>{__it1, __it2, __it_out};
+#endif
+}
+
+// Bounded serial set_difference: computes the difference of __r1 and __r2 into __out_r, never writing past the end
+// of __out_r. If __out_r is too small the result is truncated and the returned iterators mark the positions reached.
+template <typename _R1, typename _R2, typename _OutRange, typename _Comp, typename _Proj1, typename _Proj2>
+__set_difference_return_t<_R1, _R2, _OutRange>
+__serial_set_difference(_R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2)
+{
+    using _DifferenceType = oneapi::dpl::__ranges::__common_size_t<decltype(__r1), decltype(__r2), decltype(__out_r)>;
+
+    auto [__it1, __end1] = oneapi::dpl::__ranges::__get_range_bounds(__r1);
+    auto [__it2, __end2] = oneapi::dpl::__ranges::__get_range_bounds(__r2);
+    auto [__out_it, __out_end] = oneapi::dpl::__ranges::__get_range_bounds(__out_r);
+
+    // 1. Main set_difference operation
+    while (__it1 != __end1 && __it2 != __end2)
+    {
+        if (std::invoke(__comp, std::invoke(__proj1, *__it1), std::invoke(__proj2, *__it2)))
+        {
+            // *__it1 orders before *__it2: emit it, or stop if the output has no room left
+            if (__out_it == __out_end)
+                break;
+            *__out_it = *__it1;
+            ++__it1;
+            ++__out_it;
+        }
+        else if (std::invoke(__comp, std::invoke(__proj2, *__it2), std::invoke(__proj1, *__it1)))
+        {
+            ++__it2;
+        }
+        else
+        {
+            // equivalent elements: drop the one from the first range and consume its counterpart
+            ++__it1;
+            ++__it2;
+        }
+    }
+
+    // 2. Copy the rest of the first sequence, clipped to the capacity still available in the output
+    // (copies nothing when the loop above stopped because the output was full)
+    const _DifferenceType __remaining_capacity = __out_end - __out_it;
+    const _DifferenceType __copy_n = __end1 - __it1;
+    auto __copy = std::ranges::copy(__it1, __it1 + std::min(__copy_n, __remaining_capacity), __out_it);
+
+    return __create_set_difference_result<_R1, _R2, _OutRange>(__copy.in, __it2, __copy.out);
+}
 
 template <typename _R1, typename _R2, typename _OutRange, typename _Comp, typename _Proj1, typename _Proj2>
-__set_difference_return_t<_R1, _OutRange>
+__set_difference_return_t<_R1, _R2, _OutRange>
 __brick_set_difference(_R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2,
                        /*__is_vector=*/std::false_type) noexcept
 {
-    return std::ranges::set_difference(std::forward<_R1>(__r1), std::forward<_R2>(__r2), std::ranges::begin(__out_r),
-                                       __comp, __proj1, __proj2);
+    return __serial_set_difference(std::forward<_R1>(__r1), std::forward<_R2>(__r2), std::forward<_OutRange>(__out_r),
+                                   __comp, __proj1, __proj2);
 }
 
 template <typename _R1, typename _R2, typename _OutRange, typename _Comp, typename _Proj1, typename _Proj2>
-__set_difference_return_t<_R1, _OutRange>
+__set_difference_return_t<_R1, _R2, _OutRange>
 __brick_set_difference(_R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2,
                        /*__is_vector=*/std::true_type) noexcept
 {
     _PSTL_PRAGMA_MESSAGE("Vectorized algorithm unimplemented, redirected to serial");
-    return std::ranges::set_difference(std::forward<_R1>(__r1), std::forward<_R2>(__r2), std::ranges::begin(__out_r),
-                                       __comp, __proj1, __proj2);
+    return __serial_set_difference(std::forward<_R1>(__r1), std::forward<_R2>(__r2), std::forward<_OutRange>(__out_r),
+                                   __comp, __proj1, __proj2);
 }
 
 template <typename _Tag, typename _ExecutionPolicy, typename _R1, typename _R2, typename _OutRange, typename _Comp,
           typename _Proj1, typename _Proj2>
-__set_difference_return_t<_R1, _OutRange>
+__set_difference_return_t<_R1, _R2, _OutRange>
 __pattern_set_difference(_Tag, _ExecutionPolicy&&, _R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp,
                          _Proj1 __proj1, _Proj2 __proj2)
 {
-    static_assert(__is_serial_tag_v<_Tag>);
+    static_assert(__is_serial_tag_v<_Tag> || __is_parallel_forward_tag_v<_Tag>);
 
     return __brick_set_difference(std::forward<_R1>(__r1), std::forward<_R2>(__r2), std::forward<_OutRange>(__out_r),
                                   __comp, __proj1, __proj2, typename _Tag::__is_vector{});
@@ -1039,37 +1171,34 @@ __pattern_set_difference(_Tag, _ExecutionPolicy&&, _R1&& __r1, _R2&& __r2, _OutR
 
 template <class _IsVector, typename _ExecutionPolicy, typename _R1, typename _R2, typename _OutRange, typename _Comp,
           typename _Proj1, typename _Proj2>
-__set_difference_return_t<_R1, _OutRange>
+__set_difference_return_t<_R1, _R2, _OutRange>
 __pattern_set_difference(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __exec, _R1&& __r1, _R2&& __r2,
                          _OutRange&& __out_r, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2)
 {
     using _RandomAccessIterator1 = std::ranges::iterator_t<_R1>;
     using _RandomAccessIterator2 = std::ranges::iterator_t<_R2>;
-    using _T = std::ranges::range_value_t<_OutRange>;
+    using _RandomAccessIteratorOut = std::ranges::iterator_t<_OutRange>;
 
     using _DifferenceType1 = typename std::iterator_traits<_RandomAccessIterator1>::difference_type;
     using _DifferenceType2 = typename std::iterator_traits<_RandomAccessIterator2>::difference_type;
     using _DifferenceType = std::common_type_t<_DifferenceType1, _DifferenceType2>;
 
-    const auto __n1 = std::ranges::size(__r1);
-    const auto __n2 = std::ranges::size(__r2);
-
-    auto __first1 = std::ranges::begin(__r1);
-    auto __last1 = __first1 + __n1;
-    auto __first2 = std::ranges::begin(__r2);
-    auto __last2 = __first2 + __n2;
-    auto __result = std::ranges::begin(__out_r);
+    auto [__first1, __last1, __n1] = oneapi::dpl::__ranges::__get_range_bounds_n(__r1);
+    auto [__first2, __last2, __n2] = oneapi::dpl::__ranges::__get_range_bounds_n(__r2);
+    auto [__result1, __result2] = oneapi::dpl::__ranges::__get_range_bounds(__out_r);
 
     // {} \ {2}: the difference is empty
     if (__n1 == 0)
-        return {__first1, __result};
+        return __create_set_difference_result<_R1, _R2, _OutRange>(__first1, __first2, __result1);
 
     // {1} \ {}: parallel copying just first sequence
     if (__n2 == 0)
     {
-        auto __out_last = __pattern_walk2_brick(__tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1,
-                                                __result, __internal::__brick_copy<__parallel_tag<_IsVector>>{});
-        return {__last1, __out_last};
+        const _DifferenceType __n = std::min(__last1 - __first1, __result2 - __result1);
+        auto __out_last =
+            __internal::__pattern_walk2_brick(__tag, std::forward<_ExecutionPolicy>(__exec), __first1, __first1 + __n,
+                                              __result1, __internal::__brick_copy<__parallel_tag<_IsVector>>{});
+        return __create_set_difference_result<_R1, _R2, _OutRange>(__first1 + __n, __first2, __out_last);
     }
 
     // testing  whether the sequences are intersected
@@ -1079,9 +1208,11 @@ __pattern_set_difference(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __e
     //{1} < {2}: seq 2 is wholly greater than seq 1, so, parallel copying just first sequence
     if (__left_bound_seq_1 == __last1)
     {
-        auto __out_last = __pattern_walk2_brick(__tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1,
-                                                __result, __internal::__brick_copy<__parallel_tag<_IsVector>>{});
-        return {__last1, __out_last};
+        const _DifferenceType __n = std::min(__last1 - __first1, __result2 - __result1);
+        auto __out_last =
+            __internal::__pattern_walk2_brick(__tag, std::forward<_ExecutionPolicy>(__exec), __first1, __first1 + __n,
+                                              __result1, __internal::__brick_copy<__parallel_tag<_IsVector>>{});
+        return __create_set_difference_result<_R1, _R2, _OutRange>(__first1 + __n, __first2, __out_last);
     }
 
     // testing  whether the sequences are intersected
@@ -1091,30 +1222,35 @@ __pattern_set_difference(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __e
     //{2} < {1}: seq 1 is wholly greater than seq 2, so, parallel copying just first sequence
     if (__left_bound_seq_2 == __last2)
     {
+        const _DifferenceType __n = std::min(__last1 - __first1, __result2 - __result1);
         auto __out_last =
-            __internal::__pattern_walk2_brick(__tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1,
-                                              __result, __brick_copy<__parallel_tag<_IsVector>>{});
-        return {__last1, __out_last};
+            __internal::__pattern_walk2_brick(__tag, std::forward<_ExecutionPolicy>(__exec), __first1, __first1 + __n,
+                                              __result1, __internal::__brick_copy<__parallel_tag<_IsVector>>{});
+        return __create_set_difference_result<_R1, _R2, _OutRange>(__first1 + __n, __last2, __out_last);
     }
 
-    if (__n1 + __n2 > __set_algo_cut_off)
+    if (oneapi::dpl::__internal::__is_set_algo_cutoff_exceeded(__n1 + __n2))
     {
-        auto __out_last = __parallel_set_op(
-            __tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __last2, __result,
-            [](_DifferenceType __n, _DifferenceType) { return __n; },
-            [](_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2,
-               _RandomAccessIterator2 __last2, _T* __result, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2) {
-                return oneapi::dpl::__utils::__set_difference_construct(__first1, __last1, __first2, __last2, __result,
-                                                                        __BrickCopyConstruct<_IsVector>(), __comp,
-                                                                        __proj1, __proj2);
-            },
-            __comp, __proj1, __proj2);
-        return {__last1, __result + (__out_last - __result)};
+        // Seq 2 is passed starting at __left_bound_seq_2: the skipped [first2; left_bound_seq_2) < [first1; last1)
+        auto [__it1, __it2, __it_out] =
+            __internal::__parallel_set_op</*__Bounded*/ true>(
+                __tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __left_bound_seq_2, __last2,
+                __result1, __result2, __comp, __proj1, __proj2,
+                [](_DifferenceType __n, _DifferenceType) { return __n; },
+                [](_DifferenceType __n, _DifferenceType __m) { return __n + __m; },
+                [](auto&&... __args) {
+                    return oneapi::dpl::__utils::__set_difference_construct<__BrickCopyConstruct<_IsVector>>(
+                        std::forward<decltype(__args)>(__args)...);
+                })
+                .template __get_reached_in1_in2_out<
+                    std::tuple<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIteratorOut>>();
+
+        return __create_set_difference_result<_R1, _R2, _OutRange>(__it1, __it2, __it_out);
     }
 
     // use serial algorithm
-    return std::ranges::set_difference(std::forward<_R1>(__r1), std::forward<_R2>(__r2), std::ranges::begin(__out_r),
-                                       __comp, __proj1, __proj2);
+    return __serial_set_difference(std::forward<_R1>(__r1), std::ranges::subrange(__left_bound_seq_2, __last2),
+                                   std::forward<_OutRange>(__out_r), __comp, __proj1, __proj2);
 }
 
 //---------------------------------------------------------------------------------------------------------------------
@@ -1127,14 +1263,71 @@ using __set_symmetric_difference_return_t =
                                                  std::ranges::borrowed_iterator_t<_R2>,
                                                  std::ranges::borrowed_iterator_t<_OutRange>>;
 
+// Serial bounded set_symmetric_difference: writes the symmetric difference of __r1 and __r2 into __out_r and
+// stops writing once __out_r is full. Returns the positions reached in __r1, __r2 and __out_r.
+template <typename _R1, typename _R2, typename _OutRange, typename _Comp, typename _Proj1, typename _Proj2>
+__set_symmetric_difference_return_t<_R1, _R2, _OutRange>
+__serial_set_symmetric_difference(_R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp, _Proj1 __proj1,
+                                  _Proj2 __proj2)
+{
+    using DifferenceType = oneapi::dpl::__ranges::__common_size_t<decltype(__r1), decltype(__r2), decltype(__out_r)>;
+
+    auto [__it1, __end1] = oneapi::dpl::__ranges::__get_range_bounds(__r1);
+    auto [__it2, __end2] = oneapi::dpl::__ranges::__get_range_bounds(__r2);
+    auto [__out_it, __out_end] = oneapi::dpl::__ranges::__get_range_bounds(__out_r);
+
+    // 1. Merge step: runs while both inputs have elements; leaves the loop early once the output is full
+    while (__it1 != __end1 && __it2 != __end2)
+    {
+        if (std::invoke(__comp, std::invoke(__proj1, *__it1), std::invoke(__proj2, *__it2)))
+        {
+            if (__out_it != __out_end)
+            {
+                *__out_it = *__it1;
+                ++__it1;
+                ++__out_it;
+            }
+            else
+                break; // output range is full
+        }
+        else if (std::invoke(__comp, std::invoke(__proj2, *__it2), std::invoke(__proj1, *__it1)))
+        {
+            if (__out_it != __out_end)
+            {
+                *__out_it = *__it2;
+                ++__it2;
+                ++__out_it;
+            }
+            else
+                break; // output range is full
+        }
+        else // neither element is less than the other: skip both, nothing is written
+        {
+            ++__it1;
+            ++__it2;
+        }
+    }
+
+    // 2. Copy what is left of the inputs, limited by the remaining output capacity (zero after an early break)
+    const DifferenceType __remaining_capacity1 = __out_end - __out_it;
+    const DifferenceType __copy_n1 = __end1 - __it1;
+    auto __copy1 = std::ranges::copy(__it1, __it1 + std::min(__copy_n1, __remaining_capacity1), __out_it);
+
+    const DifferenceType __remaining_capacity2 = __out_end - __copy1.out;
+    const DifferenceType __copy_n2 = __end2 - __it2;
+    auto __copy2 = std::ranges::copy(__it2, __it2 + std::min(__copy_n2, __remaining_capacity2), __copy1.out);
+
+    return {__copy1.in, __copy2.in, __copy2.out};
+}
+
 template <typename _R1, typename _R2, typename _OutRange, typename _Comp, typename _Proj1, typename _Proj2>
 __set_symmetric_difference_return_t<_R1, _R2, _OutRange>
 __brick_set_symmetric_difference(_R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp, _Proj1 __proj1,
                                  _Proj2 __proj2,
                                  /*__is_vector=*/std::false_type) noexcept
 {
-    return std::ranges::set_symmetric_difference(std::forward<_R1>(__r1), std::forward<_R2>(__r2),
-                                                 std::ranges::begin(__out_r), __comp, __proj1, __proj2);
+    return __serial_set_symmetric_difference(std::forward<_R1>(__r1), std::forward<_R2>(__r2),
+                                             std::forward<_OutRange>(__out_r), __comp, __proj1, __proj2);
 }
 
 template <typename _R1, typename _R2, typename _OutRange, typename _Comp, typename _Proj1, typename _Proj2>
@@ -1144,8 +1337,8 @@ __brick_set_symmetric_difference(_R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _C
                                  /*__is_vector=*/std::true_type) noexcept
 {
     _PSTL_PRAGMA_MESSAGE("Vectorized algorithm unimplemented, redirected to serial");
-    return std::ranges::set_symmetric_difference(std::forward<_R1>(__r1), std::forward<_R2>(__r2),
-                                                 std::ranges::begin(__out_r), __comp, __proj1, __proj2);
+    return __serial_set_symmetric_difference(std::forward<_R1>(__r1), std::forward<_R2>(__r2),
+                                             std::forward<_OutRange>(__out_r), __comp, __proj1, __proj2);
 }
 
 template <typename _Tag, typename _ExecutionPolicy, typename _R1, typename _R2, typename _OutRange, typename _Comp,
@@ -1154,7 +1347,7 @@ __set_symmetric_difference_return_t<_R1, _R2, _OutRange>
 __pattern_set_symmetric_difference(_Tag, _ExecutionPolicy&&, _R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp,
                                    _Proj1 __proj1, _Proj2 __proj2)
 {
-    static_assert(__is_serial_tag_v<_Tag>);
+    static_assert(__is_serial_tag_v<_Tag> || __is_parallel_forward_tag_v<_Tag>);
 
     return __brick_set_symmetric_difference(std::forward<_R1>(__r1), std::forward<_R2>(__r2),
                                             std::forward<_OutRange>(__out_r), __comp, __proj1, __proj2,
@@ -1167,35 +1360,23 @@ __set_symmetric_difference_return_t<_R1, _R2, _OutRange>
 __pattern_set_symmetric_difference(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __exec, _R1&& __r1, _R2&& __r2,
                                    _OutRange&& __out_r, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2)
 {
-    using _RandomAccessIterator1 = std::ranges::iterator_t<_R1>;
-    using _RandomAccessIterator2 = std::ranges::iterator_t<_R2>;
-    using _Tp = std::ranges::range_value_t<_OutRange>;
-
-    const auto __n1 = std::ranges::size(__r1);
-    const auto __n2 = std::ranges::size(__r2);
+    auto [__first1, __last1, __n1] = oneapi::dpl::__ranges::__get_range_bounds_n(__r1);
+    auto [__first2, __last2, __n2] = oneapi::dpl::__ranges::__get_range_bounds_n(__r2);
+    auto [__result1, __result2] = oneapi::dpl::__ranges::__get_range_bounds(__out_r);
 
     // use serial algorithm
-    if (__n1 + __n2 <= oneapi::dpl::__internal::__set_algo_cut_off)
-        return std::ranges::set_symmetric_difference(std::forward<_R1>(__r1), std::forward<_R2>(__r2),
-                                                     std::ranges::begin(__out_r), __comp, __proj1, __proj2);
-
-    auto __first1 = std::ranges::begin(__r1);
-    auto __last1 = __first1 + __n1;
-    auto __first2 = std::ranges::begin(__r2);
-    auto __last2 = __first2 + __n2;
-    auto __result = std::ranges::begin(__out_r);
-
-    auto __out_last = oneapi::dpl::__internal::__parallel_set_union_op(
-        __tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __last2, __result,
-        [](_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2,
-           _RandomAccessIterator2 __last2, _Tp* __result, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2) {
-            return oneapi::dpl::__utils::__set_symmetric_difference_construct(
-                __first1, __last1, __first2, __last2, __result,
-                oneapi::dpl::__internal::__BrickCopyConstruct<_IsVector>(), __comp, __proj1, __proj2);
-        },
-        __comp, __proj1, __proj2);
-
-    return {__last1, __last2, __out_last};
+    if (!oneapi::dpl::__internal::__is_set_algo_cutoff_exceeded(__n1 + __n2))
+        return __serial_set_symmetric_difference(std::forward<_R1>(__r1), std::forward<_R2>(__r2),
+                                                 std::forward<_OutRange>(__out_r), __comp, __proj1, __proj2);
+
+    return oneapi::dpl::__internal::__parallel_set_union_op</*__Bounded*/ true>(
+               __tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __last2, __result1,
+               __result2, __comp, __proj1, __proj2,
+               [](auto&&... __args) {
+                   return oneapi::dpl::__utils::__set_symmetric_difference_construct<__BrickCopyConstruct<_IsVector>>(
+                       std::forward<decltype(__args)>(__args)...);
+               })
+        .template __get_reached_in1_in2_out<__set_symmetric_difference_return_t<_R1, _R2, _OutRange>>();
 }
 
 //---------------------------------------------------------------------------------------------------------------------
diff --git a/include/oneapi/dpl/pstl/glue_algorithm_ranges_impl.h b/include/oneapi/dpl/pstl/glue_algorithm_ranges_impl.h
index 561b80884ca..3176adf750b 100644
--- a/include/oneapi/dpl/pstl/glue_algorithm_ranges_impl.h
+++ b/include/oneapi/dpl/pstl/glue_algorithm_ranges_impl.h
@@ -897,8 +897,7 @@ struct __set_difference_fn
                  std::mergeable<std::ranges::iterator_t<_R1>, std::ranges::iterator_t<_R2>,
                                 std::ranges::iterator_t<_OutRange>, _Comp, _Proj1, _Proj2>
 
-    std::ranges::set_difference_result<std::ranges::borrowed_iterator_t<_R1>,
-                                       std::ranges::borrowed_iterator_t<_OutRange>>
+    oneapi::dpl::__internal::__ranges::__set_difference_return_t<_R1, _R2, _OutRange>
     operator()(_ExecutionPolicy&& __exec, _R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp = {},
                _Proj1 __proj1 = {}, _Proj2 __proj2 = {}) const
     {
diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h
index 9f2e53cdcbd..095ab4be012 100644
--- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h
+++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h
@@ -1102,6 +1102,7 @@ __pattern_set_intersection(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec,
         oneapi::dpl::__ranges::__get_subscription_view(__r1), oneapi::dpl::__ranges::__get_subscription_view(__r2),
         oneapi::dpl::__ranges::__get_subscription_view(__out_r), __comp, __proj1, __proj2);
 
+    // TODO: this result does not follow the new rules for stop positions of std::ranges::set_intersection
     return {__first1 + __n1, __first2 + __n2, __result + __result_size};
 }
 
@@ -1111,18 +1112,21 @@ struct __set_difference_copy_case_1;
 
 template <typename _BackendTag, typename _ExecutionPolicy, typename _R1, typename _R2, typename _OutRange,
           typename _Comp, typename _Proj1, typename _Proj2>
-std::ranges::set_difference_result<std::ranges::borrowed_iterator_t<_R1>, std::ranges::borrowed_iterator_t<_OutRange>>
+oneapi::dpl::__internal::__ranges::__set_difference_return_t<_R1, _R2, _OutRange>
 __pattern_set_difference(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _R1&& __r1, _R2&& __r2,
                          _OutRange&& __out_r, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2)
 {
     const auto __first1 = std::ranges::begin(__r1);
+    const auto __first2 = std::ranges::begin(__r2);
     const auto __result = std::ranges::begin(__out_r);
 
     const auto __n1 = oneapi::dpl::__ranges::__size(__r1);
+    const auto __n2 = oneapi::dpl::__ranges::__size(__r2);
 
     // {} \ {2}: the difference is empty
     if (__n1 == 0)
-        return {__first1, __result};
+        return oneapi::dpl::__internal::__ranges::__create_set_difference_result<_R1, _R2, _OutRange>(
+            __first1, __first2, __result);
 
     // {1} \ {}: the difference is {1}
     if (oneapi::dpl::__ranges::__empty(__r2))
@@ -1135,16 +1139,18 @@ __pattern_set_difference(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __e
             oneapi::dpl::__ranges::__get_subscription_view(__r1),
             oneapi::dpl::__ranges::__get_subscription_view(__out_r));
 
-        return {__first1 + __n1, __result + __idx};
+        return oneapi::dpl::__internal::__ranges::__create_set_difference_result<_R1, _R2, _OutRange>(
+            __first1 + __n1, __first2, __result + __idx);
     }
 
     const std::size_t __result_size = __par_backend_hetero::__parallel_set_op<unseq_backend::_DifferenceTag>(
         _BackendTag{}, unseq_backend::_DifferenceTag{}, std::forward<_ExecutionPolicy>(__exec),
-        oneapi::dpl::__ranges::__get_subscription_view(__r1),
-        oneapi::dpl::__ranges::__get_subscription_view(std::forward<_R2>(__r2)),
+        oneapi::dpl::__ranges::__get_subscription_view(__r1), oneapi::dpl::__ranges::__get_subscription_view(__r2),
         oneapi::dpl::__ranges::__get_subscription_view(__out_r), __comp, __proj1, __proj2);
 
-    return {__first1 + __n1, __result + __result_size};
+    // TODO: the second argument (__first2 + __n2) is not correct yet
+    return oneapi::dpl::__internal::__ranges::__create_set_difference_result<_R1, _R2, _OutRange>(
+        __first1 + __n1, __first2 + __n2, __result + __result_size);
 }
 
 //Dummy names to avoid kernel problems
diff --git a/include/oneapi/dpl/pstl/parallel_backend_tbb.h b/include/oneapi/dpl/pstl/parallel_backend_tbb.h
index 7957e49a56f..003f03293b8 100644
--- a/include/oneapi/dpl/pstl/parallel_backend_tbb.h
+++ b/include/oneapi/dpl/pstl/parallel_backend_tbb.h
@@ -478,7 +478,7 @@ class __func_task : public __task
     execute()
     {
         return _M_func(this);
-    };
+    }
 
   public:
     template <typename _Fn>
diff --git a/include/oneapi/dpl/pstl/parallel_backend_utils.h b/include/oneapi/dpl/pstl/parallel_backend_utils.h
index af85a4fc5e9..35ef85669cd 100644
--- a/include/oneapi/dpl/pstl/parallel_backend_utils.h
+++ b/include/oneapi/dpl/pstl/parallel_backend_utils.h
@@ -24,6 +24,7 @@
 #include <utility>
 #include <vector>
 #include <cassert>
+#include <cstdint> // for std::uint8_t
 #include "utils.h"
 #include "memory_fwd.h"
 #include "functional_impl.h" // for oneapi::dpl::identity, std::invoke
@@ -217,123 +218,323 @@ struct __serial_move_merge
     }
 };
 
-template <typename _ForwardIterator1, typename _ForwardIterator2, typename _OutputIterator,
-          typename _CopyConstructRange, typename _Compare, typename _Proj1, typename _Proj2>
-_OutputIterator
+enum class __parallel_set_op_mask : std::uint8_t
+{
+    eNone = 0x00,    // initial state (no bits set)
+    eData1 = 0x01,   // bit: an item of the first input is consumed
+    eData2 = 0x02,   // bit: an item of the second input is consumed
+    eDataOut = 0x04, // bit: an item is written to the output
+
+    eBoth = 0x03,     // eData1 | eData2: both inputs advance, nothing is written
+    eData1Out = 0x05, // eData1 | eDataOut: an item of the first input is copied to the output
+    eData2Out = 0x06, // eData2 | eDataOut: an item of the second input is copied to the output
+    eBothOut = 0x07   // eBoth  | eDataOut: both inputs advance and one item is written to the output
+};
+
+inline std::nullptr_t
+__set_iterator_mask(std::nullptr_t, __parallel_set_op_mask) noexcept // null mask: nothing is recorded
+{
+    return nullptr;
+}
+
+inline __parallel_set_op_mask*
+__set_iterator_mask(__parallel_set_op_mask* __mask, __parallel_set_op_mask __state) noexcept
+{
+    *__mask = __state; // record the state at the current mask position
+    return ++__mask;   // position for the next record
+}
+
+template <typename _Size>
+inline std::nullptr_t
+__set_iterator_mask_n(std::nullptr_t, __parallel_set_op_mask, _Size) noexcept // null mask: nothing is recorded
+{
+    return nullptr;
+}
+
+template <typename _Size>
+inline __parallel_set_op_mask*
+__set_iterator_mask_n(__parallel_set_op_mask* __mask, __parallel_set_op_mask __state, _Size __count) noexcept
+{
+    for (_Size __i = 0; __i < __count; ++__i)
+        __mask[__i] = __state; // the same state for each of the __count records
+
+    return __mask + __count; // position right after the written records
+}
+
+struct _SetOpDiscardIterator
+{
+    using iterator_category = std::output_iterator_tag;
+    using difference_type = std::ptrdiff_t;
+    using value_type = void;
+    using pointer = void;
+    using reference = void;
+
+    _SetOpDiscardIterator&
+    operator*() noexcept
+    {
+        return *this;
+    }
+
+    _SetOpDiscardIterator&
+    operator++() noexcept
+    {
+        return *this;
+    }
+
+    _SetOpDiscardIterator
+    operator++(int) noexcept
+    {
+        return *this;
+    }
+
+    template <typename T>
+    _SetOpDiscardIterator&
+    operator=(const T&) noexcept
+    {
+        return *this;
+    }
+};
+
+template <typename _InputIterator, typename _OutputIterator>
+struct _UninitializedCopyItem // copy-constructs *__it_in at the location of *__it_out
+{
+    using _OutValueType = typename std::iterator_traits<_OutputIterator>::value_type;
+
+    void
+    operator()(_InputIterator __it_in, _OutputIterator __it_out) const
+    {
+        if constexpr (!std::is_same_v<_OutputIterator, _SetOpDiscardIterator>)
+        {
+            // Construct in place with placement new: the destination is raw uninitialized memory
+            new (std::addressof(*__it_out)) _OutValueType(*__it_in);
+        } // for _SetOpDiscardIterator the call does nothing
+    }
+};
+
+template <typename _CopyConstructRange>
+struct _CopyConstructRangeOpWrapper
+{
+    _CopyConstructRange _cc_range;
+
+    template <typename _InputIterator>
+    _SetOpDiscardIterator
+    operator()(_InputIterator, _InputIterator, _SetOpDiscardIterator)
+    {
+        return _SetOpDiscardIterator{};
+    }
+
+    template <typename _InputIterator, typename _OutputIterator>
+    _OutputIterator
+    operator()(_InputIterator __first, _InputIterator __last, _OutputIterator __result)
+    {
+        return _cc_range(__first, __last, __result);
+    }
+};
+
+template <typename _ForwardIterator1, typename _ForwardIterator2, typename _OutputIterator, typename _MaskIterator>
+using _union_construct_return_t = std::tuple<_ForwardIterator1, _ForwardIterator2, _OutputIterator, _MaskIterator>;
+
+template <typename _CopyConstructRange, typename _ForwardIterator1, typename _ForwardIterator2,
+          typename _OutputIterator, typename _Compare, typename _Proj1, typename _Proj2, typename _MaskIterator>
+_union_construct_return_t<_ForwardIterator1, _ForwardIterator2, _OutputIterator, _MaskIterator>
 __set_union_construct(_ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2,
-                      _ForwardIterator2 __last2, _OutputIterator __result, _CopyConstructRange __cc_range,
-                      _Compare __comp, _Proj1 __proj1, _Proj2 __proj2)
+                      _ForwardIterator2 __last2, _OutputIterator __result, _Compare __comp, _Proj1 __proj1,
+                      _Proj2 __proj2, _MaskIterator __mask)
 {
-    using _Tp = typename ::std::iterator_traits<_OutputIterator>::value_type;
+    _UninitializedCopyItem<_ForwardIterator1, _OutputIterator> _uninitialized_copy_from1;
+    _UninitializedCopyItem<_ForwardIterator2, _OutputIterator> _uninitialized_copy_from2;
+
+    _CopyConstructRangeOpWrapper<_CopyConstructRange> __cc_range;
 
     for (; __first1 != __last1; ++__result)
     {
         if (__first2 == __last2)
-            return __cc_range(__first1, __last1, __result);
+        {
+            __mask = __set_iterator_mask_n(__mask, __parallel_set_op_mask::eData1Out, __last1 - __first1);
+            __result = __cc_range(__first1, __last1, __result);
+
+            return {__last1, __first2, __result, __mask};
+        }
+
         if (std::invoke(__comp, std::invoke(__proj2, *__first2), std::invoke(__proj1, *__first1)))
         {
-            ::new (::std::addressof(*__result)) _Tp(*__first2);
+            _uninitialized_copy_from2(__first2, __result);
             ++__first2;
+            __mask = __set_iterator_mask(__mask, __parallel_set_op_mask::eData2Out);
         }
         else
         {
-            ::new (::std::addressof(*__result)) _Tp(*__first1);
+            _uninitialized_copy_from1(__first1, __result);
             if (!std::invoke(__comp, std::invoke(__proj1, *__first1), std::invoke(__proj2, *__first2)))
+            {
                 ++__first2;
+                __mask = __set_iterator_mask(__mask, __parallel_set_op_mask::eBothOut);
+            }
+            else
+            {
+                __mask = __set_iterator_mask(__mask, __parallel_set_op_mask::eData1Out);
+            }
             ++__first1;
         }
     }
-    return __cc_range(__first2, __last2, __result);
+
+    __mask = __set_iterator_mask_n(__mask, __parallel_set_op_mask::eData2Out, __last2 - __first2);
+    __result = __cc_range(__first2, __last2, __result);
+
+    return {__first1, __last2, __result, __mask};
 }
 
-template <typename _ForwardIterator1, typename _ForwardIterator2, typename _OutputIterator, typename _CopyFunc,
-          typename _Compare, typename _Proj1, typename _Proj2>
-_OutputIterator
+template <typename _CopyFunc>
+struct CopyOpWrapper
+{
+    _CopyFunc _copy;
+
+    template <typename _InputIterator>
+    void
+    operator()(_InputIterator, _SetOpDiscardIterator) const
+    {
+    }
+
+    template <typename _InputIterator, typename _OutputIterator>
+    void
+    operator()(_InputIterator __it_in, _OutputIterator __it_out) const
+    {
+        _copy(*__it_in, *__it_out);
+    }
+};
+
+template <typename _CopyFunc, typename _ForwardIterator1, typename _ForwardIterator2, typename _OutputIterator,
+          typename _Compare, typename _Proj1, typename _Proj2, typename _MaskIterator>
+_union_construct_return_t<_ForwardIterator1, _ForwardIterator2, _OutputIterator, _MaskIterator>
 __set_intersection_construct(_ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2,
-                             _ForwardIterator2 __last2, _OutputIterator __result, _CopyFunc _copy, _Compare __comp,
-                             _Proj1 __proj1, _Proj2 __proj2)
+                             _ForwardIterator2 __last2, _OutputIterator __result, _Compare __comp, _Proj1 __proj1,
+                             _Proj2 __proj2, _MaskIterator __mask)
 {
+    CopyOpWrapper<_CopyFunc> __copy;
+
     while (__first1 != __last1 && __first2 != __last2)
     {
         if (std::invoke(__comp, std::invoke(__proj1, *__first1), std::invoke(__proj2, *__first2)))
+        {
             ++__first1;
+            __mask = __set_iterator_mask(__mask, __parallel_set_op_mask::eData1);
+        }
         else if (std::invoke(__comp, std::invoke(__proj2, *__first2), std::invoke(__proj1, *__first1)))
+        {
             ++__first2;
+            __mask = __set_iterator_mask(__mask, __parallel_set_op_mask::eData2);
+        }
         else
         {
-            _copy(*__first1, *__result);
-
+            __copy(__first1, __result);
             ++__first1;
             ++__first2;
             ++__result;
+            __mask = __set_iterator_mask(__mask, __parallel_set_op_mask::eBothOut);
         }
     }
-    return __result;
+
+    return {__first1, __first2, __result, __mask};
 }
 
-template <typename _ForwardIterator1, typename _ForwardIterator2, typename _OutputIterator,
-          typename _CopyConstructRange, typename _Compare, typename _Proj1, typename _Proj2>
-_OutputIterator
+template <typename _CopyConstructRange, typename _ForwardIterator1, typename _ForwardIterator2,
+          typename _OutputIterator, typename _Compare, typename _Proj1, typename _Proj2, typename _MaskIterator>
+_union_construct_return_t<_ForwardIterator1, _ForwardIterator2, _OutputIterator, _MaskIterator>
 __set_difference_construct(_ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2,
-                           _ForwardIterator2 __last2, _OutputIterator __result, _CopyConstructRange __cc_range,
-                           _Compare __comp, _Proj1 __proj1, _Proj2 __proj2)
+                           _ForwardIterator2 __last2, _OutputIterator __result, _Compare __comp, _Proj1 __proj1,
+                           _Proj2 __proj2, _MaskIterator __mask)
 {
-    using _Tp = typename ::std::iterator_traits<_OutputIterator>::value_type;
+    _UninitializedCopyItem<_ForwardIterator1, _OutputIterator> _uninitialized_copy_from1;
+
+    _CopyConstructRangeOpWrapper<_CopyConstructRange> __cc_range;
 
-    for (; __first1 != __last1;)
+    while (__first1 != __last1)
     {
         if (__first2 == __last2)
-            return __cc_range(__first1, __last1, __result);
+        {
+            __mask = __set_iterator_mask_n(__mask, __parallel_set_op_mask::eData1Out, __last1 - __first1);
+            __result = __cc_range(__first1, __last1, __result);
+
+            return {__last1, __first2, __result, __mask};
+        }
 
         if (std::invoke(__comp, std::invoke(__proj1, *__first1), std::invoke(__proj2, *__first2)))
         {
-            ::new (::std::addressof(*__result)) _Tp(*__first1);
+            _uninitialized_copy_from1(__first1, __result);
             ++__result;
             ++__first1;
+            __mask = __set_iterator_mask(__mask, __parallel_set_op_mask::eData1Out);
         }
         else
         {
             if (!std::invoke(__comp, std::invoke(__proj2, *__first2), std::invoke(__proj1, *__first1)))
+            {
                 ++__first1;
+                __mask = __set_iterator_mask(__mask, __parallel_set_op_mask::eBoth);
+            }
+            else
+            {
+                __mask = __set_iterator_mask(__mask, __parallel_set_op_mask::eData2);
+            }
             ++__first2;
         }
     }
-    return __result;
+
+    return {__first1, __first2, __result, __mask};
 }
 
-template <typename _ForwardIterator1, typename _ForwardIterator2, typename _OutputIterator,
-          typename _CopyConstructRange, typename _Compare, typename _Proj1, typename _Proj2>
-_OutputIterator
+template <typename _CopyConstructRange, typename _ForwardIterator1, typename _ForwardIterator2,
+          typename _OutputIterator, typename _Compare, typename _Proj1, typename _Proj2, typename _MaskIterator>
+_union_construct_return_t<_ForwardIterator1, _ForwardIterator2, _OutputIterator, _MaskIterator>
 __set_symmetric_difference_construct(_ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2,
-                                     _ForwardIterator2 __last2, _OutputIterator __result,
-                                     _CopyConstructRange __cc_range, _Compare __comp, _Proj1 __proj1, _Proj2 __proj2)
+                                     _ForwardIterator2 __last2, _OutputIterator __result, _Compare __comp,
+                                     _Proj1 __proj1, _Proj2 __proj2, _MaskIterator __mask)
 {
-    using _Tp = typename ::std::iterator_traits<_OutputIterator>::value_type;
+    _UninitializedCopyItem<_ForwardIterator1, _OutputIterator> _uninitialized_copy_from1;
+    _UninitializedCopyItem<_ForwardIterator2, _OutputIterator> _uninitialized_copy_from2;
+
+    _CopyConstructRangeOpWrapper<_CopyConstructRange> __cc_range;
 
-    for (; __first1 != __last1;)
+    while (__first1 != __last1)
     {
         if (__first2 == __last2)
-            return __cc_range(__first1, __last1, __result);
+        {
+            __mask = __set_iterator_mask_n(__mask, __parallel_set_op_mask::eData1Out, __last1 - __first1);
+            __result = __cc_range(__first1, __last1, __result);
+
+            return {__last1, __first2, __result, __mask};
+        }
 
         if (std::invoke(__comp, std::invoke(__proj1, *__first1), std::invoke(__proj2, *__first2)))
         {
-            ::new (::std::addressof(*__result)) _Tp(*__first1);
+            // __result refers to raw uninitialized memory: the helper copy-constructs the item in place
+            _uninitialized_copy_from1(__first1, __result);
             ++__result;
             ++__first1;
+            __mask = __set_iterator_mask(__mask, __parallel_set_op_mask::eData1Out);
         }
         else
         {
             if (std::invoke(__comp, std::invoke(__proj2, *__first2), std::invoke(__proj1, *__first1)))
             {
-                ::new (::std::addressof(*__result)) _Tp(*__first2);
+                // __result refers to raw uninitialized memory: the helper copy-constructs the item in place
+                _uninitialized_copy_from2(__first2, __result);
                 ++__result;
+                __mask = __set_iterator_mask(__mask, __parallel_set_op_mask::eData2Out);
             }
             else
+            {
+                ++__first1; // equivalent items: consumed from both inputs, nothing is written
+                __mask = __set_iterator_mask(__mask, __parallel_set_op_mask::eBoth);
+            }
             ++__first2;
         }
     }
-    return __cc_range(__first2, __last2, __result);
+
+    __mask = __set_iterator_mask_n(__mask, __parallel_set_op_mask::eData2Out, __last2 - __first2);
+    __result = __cc_range(__first2, __last2, __result);
+
+    return {__first1, __last2, __result, __mask};
+}
 
 template <template <typename, typename...> typename _Concrete, typename _ValueType, typename... _Args>
@@ -402,6 +603,38 @@ struct __enumerable_thread_local_storage_base
     const std::tuple<_Args...> __args;
 };
 
+template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _RandomAccessOutputIterator>
+struct __set_operations_result // Bundles the iterators reached in input 1, input 2 and the output
+{
+    _RandomAccessIterator1 __in1;         // iterator into the first input
+    _RandomAccessIterator2 __in2;         // iterator into the second input
+    _RandomAccessOutputIterator __it_out; // iterator into the output
+
+    // Get reached input1, input2 and output iterators; TResult is list-initialized from {__in1, __in2, __it_out}
+    template <typename TResult>
+    TResult
+    __get_reached_in1_in2_out() const
+    {
+        return {__in1, __in2, __it_out};
+    }
+
+    // Get reached output iterator (returns a copy of __it_out)
+    _RandomAccessOutputIterator
+    __get_reached_out() const
+    {
+        return __it_out;
+    }
+
+    __set_operations_result<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessOutputIterator>
+    operator+(std::tuple<typename std::iterator_traits<_RandomAccessIterator1>::difference_type,
+                         typename std::iterator_traits<_RandomAccessIterator2>::difference_type,
+                         typename std::iterator_traits<_RandomAccessOutputIterator>::difference_type>
+                  __offsets) const // offsets: <0> applied to __in1, <1> to __in2, <2> to __it_out
+    {
+        return {__in1 + std::get<0>(__offsets), __in2 + std::get<1>(__offsets), __it_out + std::get<2>(__offsets)};
+    }
+};
+
 } // namespace __utils
 } // namespace dpl
 } // namespace oneapi
diff --git a/include/oneapi/dpl/pstl/utils_ranges.h b/include/oneapi/dpl/pstl/utils_ranges.h
index 2453d0288f8..135540a5d4d 100644
--- a/include/oneapi/dpl/pstl/utils_ranges.h
+++ b/include/oneapi/dpl/pstl/utils_ranges.h
@@ -832,6 +832,30 @@ __get_subscription_view(_View&& __view)
     // to provide operator[] access and extend lifetime if necessary (for temporary ranges).
     return __subscription_impl_view_simple<_ViewInstance>(__view);
 }
+
+// Returns std::tuple{begin, begin + size} for a sized random-access range (end is computed, not queried)
+template <typename _Range>
+    requires std::ranges::random_access_range<std::remove_cvref_t<_Range>> &&
+             std::ranges::sized_range<std::remove_cvref_t<_Range>>
+auto
+__get_range_bounds(_Range&& __rng)
+{
+    const auto __size = oneapi::dpl::__ranges::__size(__rng);
+    auto __begin = oneapi::dpl::__ranges::__begin(__rng);
+    return std::make_tuple(__begin, __begin + __size);
+}
+
+// Returns std::tuple{begin, begin + size, size} for a sized random-access range (end is computed, not queried)
+template <typename _Range>
+    requires std::ranges::random_access_range<std::remove_cvref_t<_Range>> &&
+             std::ranges::sized_range<std::remove_cvref_t<_Range>>
+auto
+__get_range_bounds_n(_Range&& __rng)
+{
+    const auto __size = oneapi::dpl::__ranges::__size(__rng);
+    auto __begin = oneapi::dpl::__ranges::__begin(__rng);
+    return std::make_tuple(__begin, __begin + __size, __size);
+}
 #endif // _ONEDPL_CPP20_RANGES_PRESENT
 
 } // namespace __ranges
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index fc517f500e1..e8cf65f1970 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -188,6 +188,9 @@ macro(onedpl_add_test test_source_file switch_off_checked_iterators)
         "transform_reduce.pass" "transform_reduce.pass.coal" "transform_scan.pass" "algorithm.pass"
         "execution.pass" "functional.pass" "algorithms_redirection.pass" "usm_memory_replacement.pass")
 
+    # Tests that are built in two variants: with and without ONEDPL_RANGES_SET_DIFFERENCE_CPP23_RESULT
+    set(set_difference_cpp23_variant_tests "std_ranges_set_difference.pass")
+
     set(extra_test_label "")
     if (_test_name IN_LIST pstloffload_smoke_tests)
         set(extra_test_label "pstloffload_smoke_tests")
@@ -203,6 +206,10 @@ macro(onedpl_add_test test_source_file switch_off_checked_iterators)
     elseif(_test_name STREQUAL "free_after_unload.pass")
         onedpl_construct_exec(${test_source_file} ${_test_name} ${switch_off_checked_iterators} "" "${extra_test_label}")
         onedpl_construct_exec(${test_source_file} ${_test_name}.after_pstl_offload ${switch_off_checked_iterators} "" "${extra_test_label}")
+    elseif (_test_name IN_LIST set_difference_cpp23_variant_tests)
+        onedpl_construct_exec(${test_source_file} ${_test_name} ${switch_off_checked_iterators} "-DONEDPL_RANGES_SET_DIFFERENCE_CPP23_RESULT=0" "${extra_test_label}")
+        string(REPLACE "\.pass" "_cpp23\.pass" _test_name ${_test_name})
+        onedpl_construct_exec(${test_source_file} ${_test_name} ${switch_off_checked_iterators} "-DONEDPL_RANGES_SET_DIFFERENCE_CPP23_RESULT=1" "${extra_test_label}")
     else()
         onedpl_construct_exec(${test_source_file} ${_test_name} ${switch_off_checked_iterators} "" "${extra_test_label}")
     endif()
diff --git a/test/general/implementation_details/test_set_op_details.pass.cpp b/test/general/implementation_details/test_set_op_details.pass.cpp
new file mode 100644
index 00000000000..d6f900f0401
--- /dev/null
+++ b/test/general/implementation_details/test_set_op_details.pass.cpp
@@ -0,0 +1,1108 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Copyright (C) Intel Corporation
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// This file incorporates work covered by the following copyright and permission
+// notice:
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "support/test_config.h"
+#include "support/utils.h"
+
+#include <memory> // for std::allocator, std::destroy
+
+#if _ENABLE_STD_RANGES_TESTING
+
+#include <oneapi/dpl/pstl/parallel_backend_utils.h>
+
+#include <vector>
+#include <functional>
+#include <ranges>
+#include <iterator>
+#include <algorithm> // for std::count_if
+
+template <typename Container1, typename Container2>
+std::size_t
+evalContainerSize(const Container1& cont1, const Container2& cont2) // capacity the tests give to the output buffer
+{
+    return cont1.size() + cont2.size(); // combined element count of both inputs
+}
+
+template <typename Container1, typename Container2>
+std::size_t
+evalMaskSize(const Container1& cont1, const Container2& cont2) // number of entries the tests allocate for the mask
+{
+    return cont1.size() + cont2.size(); // one mask slot per input element of either container
+}
+
+struct UninitializedCopyValueOp // Functor that constructs a _TargetT in the storage of __target from __source
+{
+    template <typename _SourceT, typename _TargetT>
+    void
+    operator()(_SourceT&& __source, _TargetT& __target) const
+    {
+        new (std::addressof(__target)) _TargetT(std::forward<_SourceT>(__source)); // placement new, no assignment
+    }
+};
+
+// For details please see description of the enum oneapi::dpl::__utils::__parallel_set_op_mask
+using MaskContainer = std::vector<oneapi::dpl::__utils::__parallel_set_op_mask>;
+
+// Owns raw, uninitialized storage for __n objects of T; used as the output buffer of the *_construct set operation tests
+template <typename T>
+class UninitializedMemoryContainer
+{
+    std::size_t _capacity = {};
+    std::allocator<T> _allocator; // declared before _ptr: the constructor's initializer for _ptr calls allocate() on it
+    T* _ptr = nullptr;
+
+  public:
+    explicit UninitializedMemoryContainer(std::size_t __n) : _capacity(__n), _ptr(_allocator.allocate(__n)) {}
+
+    ~UninitializedMemoryContainer() { _allocator.deallocate(_ptr, _capacity); } // frees storage only, destroys no T
+
+    // Non-copyable: the storage has a single owner
+    UninitializedMemoryContainer(const UninitializedMemoryContainer&) = delete;
+    UninitializedMemoryContainer&
+    operator=(const UninitializedMemoryContainer&) = delete;
+
+    T*
+    begin() noexcept
+    {
+        return _ptr;
+    }
+
+    T*
+    end() noexcept
+    {
+        return _ptr + _capacity; // end of the allocated storage, not of the constructed elements
+    }
+
+    // Runs destructors for [begin(), __end); call it before the container is destroyed, the destructor does not
+    void
+    destroy_range(T* __end) noexcept
+    {
+        std::destroy(_ptr, __end);
+    }
+};
+
+[[maybe_unused]] constexpr oneapi::dpl::__utils::__parallel_set_op_mask    D1 = oneapi::dpl::__utils::__parallel_set_op_mask::eData1;
+constexpr oneapi::dpl::__utils::__parallel_set_op_mask    D2 = oneapi::dpl::__utils::__parallel_set_op_mask::eData2;
+constexpr oneapi::dpl::__utils::__parallel_set_op_mask   D12 = oneapi::dpl::__utils::__parallel_set_op_mask::eBoth;
+constexpr oneapi::dpl::__utils::__parallel_set_op_mask   D1O = oneapi::dpl::__utils::__parallel_set_op_mask::eData1Out;
+constexpr oneapi::dpl::__utils::__parallel_set_op_mask   D2O = oneapi::dpl::__utils::__parallel_set_op_mask::eData2Out;
+constexpr oneapi::dpl::__utils::__parallel_set_op_mask  D12O = oneapi::dpl::__utils::__parallel_set_op_mask::eBothOut;
+
+using BrickCopy = oneapi::dpl::__internal::__BrickCopyConstruct<std::false_type>;
+
+// The set_union rules verified by these tests are described at https://eel.is/c++draft/set.union
+void
+test_set_union_construct()
+{
+    using DataType = TestUtils::SetDataItem<int>;
+    using Container = std::vector<DataType>;
+    
+    // the first case - output range has enough capacity
+    {
+        const Container       cont1 = {{1, 0, 1}, {2, 1, 1}, {3, 2, 1}, {4, 3, 1}, {5, 4, 1}                      };
+        const Container       cont2 = {                      {3, 0, 2}, {4, 1, 2}, {5, 2, 2}, {6, 3, 2}, {7, 4, 2}};
+        const MaskContainer maskExp = {      D1O,       D1O,      D12O,      D12O,      D12O,       D2O,       D2O};
+        const Container  contOutExp = {{1, 0, 1}, {2, 1, 1}, {3, 2, 1}, {4, 3, 1}, {5, 4, 1}, {6, 3, 2}, {7, 4, 2}};
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_union_construct<BrickCopy>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out);
+    }
+
+    // the same case (output range has enough capacity) with the data of the input ranges swapped
+    {
+        const Container       cont1 = {                      {3, 0, 1}, {4, 1, 1}, {5, 2, 1}, {6, 3, 1}, {7, 4, 1}};
+        const Container       cont2 = {{1, 0, 2}, {2, 1, 2}, {3, 2, 2}, {4, 3, 2}, {5, 4, 2}                      };
+        const MaskContainer maskExp = {      D2O,       D2O,      D12O,      D12O,      D12O,       D1O,       D1O};
+        const Container  contOutExp = {{1, 0, 2}, {2, 1, 2}, {3, 0, 1}, {4, 1, 1}, {5, 2, 1}, {6, 3, 1}, {7, 4, 1}};
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_union_construct<BrickCopy>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out);
+    }
+}
+
+void
+test_set_union_construct_edge_cases()
+{
+    using DataType = TestUtils::SetDataItem<int>;
+    using Container = std::vector<DataType>;
+
+    // The case: both containers are empty
+    {
+        const Container cont1       = { };
+        const Container cont2       = { };
+        const MaskContainer maskExp = { };
+        const Container contOutExp  = { };
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_union_construct<BrickCopy>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out);
+    }
+
+    // The case: the first container is empty
+    {
+        const Container cont1       = {                               };
+        const Container cont2       = {{1, 0, 2}, {2, 1, 2}, {3, 2, 2}};
+        const MaskContainer maskExp = {      D2O,       D2O,       D2O};
+        const Container contOutExp  = {{1, 0, 2}, {2, 1, 2}, {3, 2, 2}};
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_union_construct<BrickCopy>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out);
+    }
+
+    // The case: the second container is empty
+    {
+        const Container cont1       = {{1, 0, 1}, {2, 1, 1}, {3, 2, 1}};
+        const Container cont2       = {                               };
+        const MaskContainer maskExp = {      D1O,       D1O,       D1O};
+        const Container contOutExp  = {{1, 0, 1}, {2, 1, 1}, {3, 2, 1}};
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_union_construct<BrickCopy>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out);
+    }
+
+    // The case: one item in the first container
+    {
+        const Container cont1       = {           {2, 0, 1}           };
+        const Container cont2       = {{1, 0, 2}, {2, 1, 2}, {3, 2, 2}};
+        const MaskContainer maskExp = {      D2O,      D12O,       D2O};
+        const Container contOutExp  = {{1, 0, 2}, {2, 0, 1}, {3, 2, 2}};
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_union_construct<BrickCopy>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out);
+    }
+
+    // The case: one item in the second container
+    {
+        const Container cont1       = {{1, 0, 1}, {2, 1, 1}, {3, 2, 1}};
+        const Container cont2       = {           {2, 0, 2}           };
+        const MaskContainer maskExp = {      D1O,      D12O,       D1O};
+        const Container contOutExp  = {{1, 0, 1}, {2, 1, 1}, {3, 2, 1}};
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_union_construct<BrickCopy>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out);
+    }
+
+    // The case: all items are equal but the last item in the first container is unique
+    {
+        const Container cont1       = {{2, 0, 1}, {2, 1, 1}, {2, 2, 1}, {3, 3, 1}};
+        const Container cont2       = {{2, 0, 2}, {2, 1, 2}, {2, 2, 2}           };
+        const MaskContainer maskExp = {     D12O ,     D12O,      D12O,       D1O};
+        const Container contOutExp  = {{2, 0, 1}, {2, 1, 1}, {2, 2, 1}, {3, 3, 1}};
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_union_construct<BrickCopy>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out);
+    }
+
+    // The case: both containers have the same items
+    {
+        const Container cont1       = {{1, 0, 1}, {2, 1, 1}, {3, 2, 1}};
+        const Container cont2       = {{1, 0, 2}, {2, 1, 2}, {3, 2, 2}};
+        const MaskContainer maskExp = {     D12O,      D12O,      D12O};
+        const Container contOutExp  = {{1, 0, 1}, {2, 1, 1}, {3, 2, 1}};
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_union_construct<BrickCopy>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out);
+    }
+
+    // The case: all items in the first container are less than those in the second one
+    {
+        const Container cont1       = {{1, 0, 1}, {2, 1, 1}, {3, 2, 1}                                 };
+        const Container cont2       = {                                 {4, 0, 2}, {5, 1, 2}, {6, 2, 2}};
+        const MaskContainer maskExp = {      D1O,       D1O,       D1O,       D2O,       D2O,       D2O};
+        const Container contOutExp  = {{1, 0, 1}, {2, 1, 1}, {3, 2, 1}, {4, 0, 2}, {5, 1, 2}, {6, 2, 2}};
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_union_construct<BrickCopy>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out);
+    }
+
+    // The case: the first container has duplicated items
+    {
+        const Container cont1       = {{1, 0, 1}, {2, 1, 1}, {2, 2, 1}, {3, 3, 1}           };
+        const Container cont2       = {           {2, 0, 2},            {3, 1, 2}, {4, 2, 2}};
+        const MaskContainer maskExp = {      D1O,      D12O,       D1O,      D12O,       D2O};
+        const Container contOutExp  = {{1, 0, 1}, {2, 1, 1}, {2, 2, 1}, {3, 3, 1}, {4, 2, 2}};
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_union_construct<BrickCopy>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out);
+    }
+}
+
+// The set_intersection rules verified by these tests are described at https://eel.is/c++draft/set.intersection
+void
+test_set_intersection_construct()
+{
+    using DataType = TestUtils::SetDataItem<int>;
+    using Container = std::vector<DataType>;
+
+    // the first case - output range has enough capacity
+    {
+        const Container cont1       = {                      {3, 0, 2}, {4, 1, 2}, {5, 2, 2}, {6, 3, 2}, {7, 4, 2}};
+        const Container cont2       = {{1, 0, 1}, {2, 1, 1}, {3, 2, 1}, {4, 3, 1}, {5, 4, 1}                      };
+        const MaskContainer maskExp = {       D2,        D2,      D12O,      D12O,      D12O                      };
+        const Container contOutExp  = {                      {3, 0, 2}, {4, 1, 2}, {5, 2, 2}                      };
+
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_intersection_construct<UninitializedCopyValueOp>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ(3, std::distance(contOut.begin(), out), "incorrect state of out for __set_intersection_construct");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        // Compare only the constructed part [contOut.begin(), out): the storage after out is uninitialized
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out),
+                         "wrong result of result contOut after __set_intersection_construct");
+
+        contOut.destroy_range(out);
+    }
+
+    // the same case (output range has enough capacity) with the data of the input ranges swapped
+    {
+        const Container cont1       = {                      {3, 0, 1}, {4, 1, 1}, {5, 2, 1}, {6, 3, 1}, {7, 4, 1}};
+        const Container cont2       = {{1, 0, 2}, {2, 1, 2}, {3, 2, 2}, {4, 3, 2}, {5, 4, 2}                      };
+        const MaskContainer maskExp = {       D2,        D2,      D12O,      D12O,      D12O                      };
+        const Container contOutExp  = {                      {3, 0, 1}, {4, 1, 1}, {5, 2, 1}                      };
+
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_intersection_construct<UninitializedCopyValueOp>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out);
+    }
+}
+
+void
+test_set_intersection_construct_edge_cases()
+{
+    using DataType = TestUtils::SetDataItem<int>;
+    using Container = std::vector<DataType>;
+
+    // The case: both containers are empty
+    {
+        const Container cont1       = { };
+        const Container cont2       = { };
+        const MaskContainer maskExp = { };
+        const Container contOutExp  = { };
+
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_intersection_construct<UninitializedCopyValueOp>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out);
+    }
+
+    // The case: the first container is empty
+    {
+        const Container cont1       = {                               };
+        const Container cont2       = {{1, 0, 2}, {2, 1, 2}, {3, 2, 2}};
+        const MaskContainer maskExp = {                               };
+        const Container contOutExp  = {                               };
+
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_intersection_construct<UninitializedCopyValueOp>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out);
+    }
+
+    // The case: the second container is empty
+    {
+        const Container cont1       = {{1, 0, 1}, {2, 1, 1}, {3, 2, 1}};
+        const Container cont2       = {                               };
+        const MaskContainer maskExp = {                               };
+        const Container contOutExp  = {                               };
+
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_intersection_construct<UninitializedCopyValueOp>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out);
+    }
+
+    // The case: one item in the first container
+    {
+        const Container cont1       = {           {2, 0, 1}           };
+        const Container cont2       = {{1, 0, 2}, {2, 1, 2}, {3, 2, 2}};
+        const MaskContainer maskExp = {       D2,      D12O           };
+        const Container contOutExp  = {           {2, 0, 1}           };
+
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_intersection_construct<UninitializedCopyValueOp>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out);
+    }
+}
+
+void
+test_set_difference_construct()
+{
+    using DataType = TestUtils::SetDataItem<int>;
+    using Container = std::vector<DataType>;
+
+    // the first case - output range has enough capacity
+    {
+        const Container cont1       = {{1, 0, 1}, {2, 1, 1}, {3, 2, 1}, {4, 3, 1}, {5, 4, 1}                      };
+        const Container cont2       = {                      {3, 0, 2}, {4, 1, 2}, {5, 2, 2}, {6, 3, 2}, {7, 4, 2}};
+        const MaskContainer maskExp = {      D1O,       D1O,       D12,       D12,       D12                      };
+        const Container contOutExp  = {{1, 0, 1}, {2, 1, 1}                                                       };
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_difference_construct<BrickCopy>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out);
+    }
+
+    // the same case (output range has enough capacity) with the data of the input ranges swapped
+    {
+        const Container cont1       = {                      {3, 0, 1}, {4, 1, 1}, {5, 2, 1}, {6, 3, 1}, {7, 4, 1}};
+        const Container cont2       = {{1, 0, 2}, {2, 1, 2}, {3, 2, 2}, {4, 3, 2}, {5, 4, 2}                      };
+        const MaskContainer maskExp = {       D2,        D2,       D12,       D12,       D12,       D1O,       D1O};
+        const Container contOutExp  = {                                                       {6, 3, 1}, {7, 4, 1}};
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_difference_construct<BrickCopy>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out);
+    }
+}
+
+void
+test_set_difference_construct_edge_cases() // boundary inputs for __set_difference_construct<BrickCopy>
+{
+    using DataType = TestUtils::SetDataItem<int>;
+    using Container = std::vector<DataType>;
+
+    // The case: both containers are empty - no output items and no mask entries are expected
+    {
+        const Container cont1       = { };
+        const Container cont2       = { };
+        const MaskContainer maskExp = { };
+        const Container contOutExp  = { };
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_difference_construct<BrickCopy>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out); // NOTE(review): presumably destroys the items constructed before 'out' - confirm
+    }
+
+    // The case: the first container is empty - nothing to output, the expected mask is empty as well
+    {
+        const Container cont1       = {                               };
+        const Container cont2       = {{1, 0, 2}, {2, 1, 2}, {3, 2, 2}};
+        const MaskContainer maskExp = {                               };
+        const Container contOutExp  = {                               };
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_difference_construct<BrickCopy>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out);
+    }
+
+    // The case: the second container is empty - every item of the first container goes to the output
+    {
+        const Container cont1       = {{1, 0, 1}, {2, 1, 1}, {3, 2, 1}};
+        const Container cont2       = {                               };
+        const MaskContainer maskExp = {      D1O,       D1O,       D1O};
+        const Container contOutExp  = {{1, 0, 1}, {2, 1, 1}, {3, 2, 1}};
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_difference_construct<BrickCopy>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out);
+    }
+
+    // The case: one item in the first container, it has a match in the second one - the output is empty
+    {
+        const Container cont1       = {           {2, 0, 1}           };
+        const Container cont2       = {{1, 0, 2}, {2, 1, 2}, {3, 2, 2}};
+        const MaskContainer maskExp = {       D2,       D12           }; // no entry is expected for the trailing {3, 2, 2}
+        const Container contOutExp  = {                               };
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_difference_construct<BrickCopy>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out);
+    }
+
+    // The case: one item in the second container - only the matching item of the first container is dropped
+    {
+        const Container cont1       = {{1, 0, 1}, {2, 1, 1}, {3, 2, 1}};
+        const Container cont2       = {           {2, 0, 2}           };
+        const MaskContainer maskExp = {      D1O,       D12,       D1O};
+        const Container contOutExp  = {{1, 0, 1},            {3, 2, 1}};
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_difference_construct<BrickCopy>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out);
+    }
+
+    // The case: all items are equal but the last item in the first container is unique - only it goes to the output
+    {
+        const Container cont1       = {{2, 0, 1}, {2, 1, 1}, {2, 2, 1}, {3, 3, 1}};
+        const Container cont2       = {{2, 0, 2}, {2, 1, 2}, {2, 2, 2}           };
+        const MaskContainer maskExp = {      D12,       D12,       D12,       D1O};
+        const Container contOutExp  = {                                 {3, 3, 1}};
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_difference_construct<BrickCopy>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out);
+    }
+
+    // The case: both containers have the same items - the output is empty
+    {
+        const Container cont1       = {{1, 0, 1}, {2, 1, 1}, {3, 2, 1}};
+        const Container cont2       = {{1, 0, 2}, {2, 1, 2}, {3, 2, 2}};
+        const MaskContainer maskExp = {      D12,       D12,       D12};
+        const Container contOutExp  = {                               };
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_difference_construct<BrickCopy>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out);
+    }
+
+    // The case: all items in the first container are less than those in the second one - all of them go to the output
+    {
+        const Container cont1       = {{1, 0, 1}, {2, 1, 1}, {3, 2, 1}                                 };
+        const Container cont2       = {                                 {4, 0, 2}, {5, 1, 2}, {6, 2, 2}};
+        const MaskContainer maskExp = {      D1O,       D1O,       D1O                                 };
+        const Container contOutExp  = {{1, 0, 1}, {2, 1, 1}, {3, 2, 1}                                 };
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_difference_construct<BrickCopy>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out);
+    }
+
+    // The case: the first container has duplicated items - only one of the two equal items is matched, the other is output
+    {
+        const Container cont1       = {{1, 0, 1}, {2, 1, 1}, {2, 2, 1}, {3, 3, 1}           };
+        const Container cont2       = {           {2, 0, 2},            {3, 1, 2}, {4, 2, 2}};
+        const MaskContainer maskExp = {      D1O,       D12,       D1O,       D12           };
+        const Container contOutExp  = {{1, 0, 1},            {2, 2, 1}                      };
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_difference_construct<BrickCopy>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out);
+    }
+}
+
+void
+test_set_symmetric_difference_construct() // partially overlapping inputs, checked in both argument orders
+{
+    using DataType = TestUtils::SetDataItem<int>;
+    using Container = std::vector<DataType>;
+
+    // the first case - output range has enough capacity; items unique to either input are expected in the output
+    {
+        const Container cont1       = {{1, 0, 1}, {2, 1, 1}, {3, 2, 1}, {4, 3, 1}, {5, 4, 1}                      };
+        const Container cont2       = {                      {3, 0, 2}, {4, 1, 2}, {5, 2, 2}, {6, 3, 2}, {7, 4, 2}};
+        const MaskContainer maskExp = {      D1O,       D1O,       D12,       D12,       D12,       D2O,       D2O};
+        const Container contOutExp  = {{1, 0, 1}, {2, 1, 1},                                  {6, 3, 2}, {7, 4, 2}};
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_symmetric_difference_construct<BrickCopy>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out); // NOTE(review): presumably destroys the items constructed before 'out' - confirm
+    }
+
+    // the same case with the data of the input ranges swapped - output range has enough capacity
+    {
+        const Container cont1       = {                      {3, 0, 1}, {4, 1, 1}, {5, 2, 1}, {6, 3, 1}, {7, 4, 1}};
+        const Container cont2       = {{1, 0, 2}, {2, 1, 2}, {3, 2, 2}, {4, 3, 2}, {5, 4, 2}                      };
+        const MaskContainer maskExp = {      D2O,       D2O,       D12,       D12,       D12,       D1O,       D1O};
+        const Container contOutExp  = {{1, 0, 2}, {2, 1, 2},                                  {6, 3, 1}, {7, 4, 1}};
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_symmetric_difference_construct<BrickCopy>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out);
+    }
+}
+
+void
+test_set_symmetric_difference_construct_edge_cases() // boundary inputs for __set_symmetric_difference_construct
+{
+    using DataType = TestUtils::SetDataItem<int>;
+    using Container = std::vector<DataType>;
+
+    // The case: the first container is empty - every item of the second container goes to the output
+    {
+        const Container cont1       = {                               };
+        const Container cont2       = {{1, 0, 2}, {2, 1, 2}, {3, 2, 2}};
+        const MaskContainer maskExp = {      D2O,       D2O,       D2O};
+        const Container contOutExp  = {{1, 0, 2}, {2, 1, 2}, {3, 2, 2}};
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_symmetric_difference_construct<BrickCopy>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out); // NOTE(review): presumably destroys the items constructed before 'out' - confirm
+    }
+
+    // The case: the second container is empty - every item of the first container goes to the output
+    {
+        const Container cont1       = {{1, 0, 1}, {2, 1, 1}, {3, 2, 1}};
+        const Container cont2       = {                               };
+        const MaskContainer maskExp = {      D1O,       D1O,       D1O};
+        const Container contOutExp  = {{1, 0, 1}, {2, 1, 1}, {3, 2, 1}};
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_symmetric_difference_construct<BrickCopy>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out);
+    }
+
+    // The case: one item in the first container - the unmatched items of the second container go to the output
+    {
+        const Container cont1       = {           {2, 0, 1}           };
+        const Container cont2       = {{1, 0, 2}, {2, 1, 2}, {3, 2, 2}};
+        const MaskContainer maskExp = {      D2O,       D12,       D2O};
+        const Container contOutExp  = {{1, 0, 2},            {3, 2, 2}};
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_symmetric_difference_construct<BrickCopy>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out);
+    }
+
+    // The case: one item in the second container - the unmatched items of the first container go to the output
+    {
+        const Container cont1       = {{1, 0, 1}, {2, 1, 1}, {3, 2, 1}};
+        const Container cont2       = {           {2, 0, 2}           };
+        const MaskContainer maskExp = {      D1O,       D12,       D1O};
+        const Container contOutExp  = {{1, 0, 1},            {3, 2, 1}};
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_symmetric_difference_construct<BrickCopy>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out);
+    }
+
+    // The case: all items are equal but the last item in the first container is unique - only it goes to the output
+    {
+        const Container cont1       = {{2, 0, 1}, {2, 1, 1}, {2, 2, 1}, {3, 3, 1}};
+        const Container cont2       = {{2, 0, 2}, {2, 1, 2}, {2, 2, 2}           };
+        const MaskContainer maskExp = {      D12,       D12,       D12,       D1O};
+        const Container contOutExp  = {                                 {3, 3, 1}};
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_symmetric_difference_construct<BrickCopy>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out);
+    }
+
+    // The case: both containers have the same items - the output is empty
+    {
+        const Container cont1       = {{1, 0, 1}, {2, 1, 1}, {3, 2, 1}};
+        const Container cont2       = {{1, 0, 2}, {2, 1, 2}, {3, 2, 2}};
+        const MaskContainer maskExp = {      D12,       D12,       D12};
+        const Container contOutExp  = {                               };
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_symmetric_difference_construct<BrickCopy>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out);
+    }
+
+    // The case: all items in the first container are less than those in the second one - both go to the output in order
+    {
+        const Container cont1       = {{1, 0, 1}, {2, 1, 1}, {3, 2, 1}                                 };
+        const Container cont2       = {                                 {4, 0, 2}, {5, 1, 2}, {6, 2, 2}};
+        const MaskContainer maskExp = {      D1O,       D1O,       D1O,       D2O,       D2O,       D2O};
+        const Container contOutExp  = {{1, 0, 1}, {2, 1, 1}, {3, 2, 1}, {4, 0, 2}, {5, 1, 2}, {6, 2, 2}};
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_symmetric_difference_construct<BrickCopy>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out);
+    }
+
+    // The case: the first container has duplicated items - only one of the two equal items is matched, the other is output
+    {
+        const Container cont1       = {{1, 0, 1}, {2, 1, 1}, {2, 2, 1}, {3, 3, 1}           };
+        const Container cont2       = {           {2, 0, 2},            {3, 1, 2}, {4, 2, 2}};
+        const MaskContainer maskExp = {      D1O,       D12,       D1O,       D12,       D2O};
+        const Container contOutExp  = {{1, 0, 1},            {2, 2, 1},            {4, 2, 2}};
+        UninitializedMemoryContainer<DataType> contOut(evalContainerSize(cont1, cont2));
+
+        MaskContainer mask(evalMaskSize(cont1, cont2));
+        auto mask_b = mask.data();
+
+        auto [it1, it2, out, mask_e] = oneapi::dpl::__utils::__set_symmetric_difference_construct<BrickCopy>(
+            cont1.begin(), cont1.end(),
+            cont2.begin(), cont2.end(),
+            contOut.begin(),
+            std::less{}, TestUtils::SetDataItemProj{}, TestUtils::SetDataItemProj{},
+            mask_b);
+
+        EXPECT_EQ_RANGES(contOutExp, std::ranges::subrange(contOut.begin(), out), "Incorrect result data state");
+        EXPECT_EQ_RANGES(maskExp, std::ranges::subrange(mask_b, mask_e), "Incorrect mask state");
+
+        contOut.destroy_range(out);
+    }
+}
+#endif // _ENABLE_STD_RANGES_TESTING
+
+int
+main()
+{
+    bool bProcessed = false; // set to true only when the tests below are compiled in and have all been called
+
+#if _ENABLE_STD_RANGES_TESTING
+    test_set_union_construct();
+    test_set_union_construct_edge_cases();
+
+    test_set_intersection_construct();
+    test_set_intersection_construct_edge_cases();
+
+    test_set_difference_construct();
+    test_set_difference_construct_edge_cases();
+
+    test_set_symmetric_difference_construct();
+    test_set_symmetric_difference_construct_edge_cases();
+
+    bProcessed = true;
+#endif // _ENABLE_STD_RANGES_TESTING
+
+    return TestUtils::done(bProcessed); // NOTE(review): presumably reports the test as skipped for 'false' - confirm
+}
diff --git a/test/parallel_api/ranges/std_ranges_set_difference.pass.cpp b/test/parallel_api/ranges/std_ranges_set_difference.pass.cpp
index 46c90ffcf6f..ecce4779280 100644
--- a/test/parallel_api/ranges/std_ranges_set_difference.pass.cpp
+++ b/test/parallel_api/ranges/std_ranges_set_difference.pass.cpp
@@ -15,9 +15,25 @@
 
 #include "std_ranges_test.h"
 
-#if _ENABLE_STD_RANGES_TESTING && !_PSTL_LIBCPP_RANGE_SET_BROKEN
+#if _ENABLE_STD_RANGES_TESTING
 namespace test_std_ranges
 {
+// TODO: remove once range-based set operations with a bounded output range are implemented for hetero policies
+template <>
+struct ResolveTestDataModeForHeteroPolicy<TestDataMode::data_in_out_lim>
+{
+    static constexpr bool RunTestForHeteroPolicy = true;
+    static constexpr TestDataMode res_mode = TestDataMode::data_in_out; // data_in_out_lim is replaced by data_in_out
+};
+
+// TODO: remove once range-based set operations with a bounded output range are implemented for hetero policies
+template <>
+struct ResolveTestDataModeForHeteroPolicy<TestDataMode::data_in_in_out_lim>
+{
+    static constexpr bool RunTestForHeteroPolicy = true;
+    static constexpr TestDataMode res_mode = TestDataMode::data_in_in_out; // data_in_in_out_lim is replaced by data_in_in_out
+};
+
 template<>
 inline int out_size_with_empty_in2<std::remove_cvref_t<decltype(oneapi::dpl::ranges::set_difference)>>(int in1_size)
 {
@@ -37,14 +53,10 @@ void test_mixed_types_host()
     std::vector<int> out_unseq(out_expected.size(), 0xCD);
     std::vector<int> out_par_unseq(out_expected.size(), 0xCD);
 
-    oneapi::dpl::ranges::set_difference(
-        oneapi::dpl::execution::seq, r1, r2, out_seq, std::ranges::less{}, test_std_ranges::proj_a, test_std_ranges::proj_b);
-    oneapi::dpl::ranges::set_difference(
-        oneapi::dpl::execution::par, r1, r2, out_par, std::ranges::less{}, test_std_ranges::proj_a, test_std_ranges::proj_b);
-    oneapi::dpl::ranges::set_difference(
-        oneapi::dpl::execution::unseq, r1, r2, out_unseq, std::ranges::less{}, test_std_ranges::proj_a, test_std_ranges::proj_b);
-    oneapi::dpl::ranges::set_difference(
-        oneapi::dpl::execution::par_unseq, r1, r2, out_par_unseq, std::ranges::less{}, test_std_ranges::proj_a, test_std_ranges::proj_b);
+    oneapi::dpl::ranges::set_difference(oneapi::dpl::execution::seq,       r1, r2, out_seq,       std::ranges::less{}, test_std_ranges::proj_a, test_std_ranges::proj_b);
+    oneapi::dpl::ranges::set_difference(oneapi::dpl::execution::par,       r1, r2, out_par,       std::ranges::less{}, test_std_ranges::proj_a, test_std_ranges::proj_b);
+    oneapi::dpl::ranges::set_difference(oneapi::dpl::execution::unseq,     r1, r2, out_unseq,     std::ranges::less{}, test_std_ranges::proj_a, test_std_ranges::proj_b);
+    oneapi::dpl::ranges::set_difference(oneapi::dpl::execution::par_unseq, r1, r2, out_par_unseq, std::ranges::less{}, test_std_ranges::proj_a, test_std_ranges::proj_b);
 
     EXPECT_EQ_RANGES(out_expected, out_seq, "wrong result with seq policy");
     EXPECT_EQ_RANGES(out_expected, out_par, "wrong result with par policy");
@@ -79,57 +91,201 @@ void test_mixed_types_device()
     }
 }
 #endif // TEST_DPCPP_BACKEND_PRESENT
-#endif // _ENABLE_STD_RANGES_TESTING && !_PSTL_LIBCPP_RANGE_SET_BROKEN
+
+#if ONEDPL_RANGES_SET_DIFFERENCE_CPP23_RESULT // C++23 compatibility mode: the result is {in, out}
+template <std::ranges::range _R1, std::ranges::range _R2, std::ranges::range _ROut>
+using set_difference_result_t =
+    std::ranges::in_out_result<std::ranges::borrowed_iterator_t<_R1>, std::ranges::borrowed_iterator_t<_ROut>>;
+#else // default mode: the result also carries an iterator into the second input range, {in1, in2, out}
+template <std::ranges::range _R1, std::ranges::range _R2, std::ranges::range _ROut>
+using set_difference_result_t =
+    std::ranges::in_in_out_result<std::ranges::borrowed_iterator_t<_R1>, std::ranges::borrowed_iterator_t<_R2>,
+                                  std::ranges::borrowed_iterator_t<_ROut>>;
+#endif // ONEDPL_RANGES_SET_DIFFERENCE_CPP23_RESULT
+
+// Sequential reference implementation of set_difference that never writes past the end of r_out.
+struct
+{
+    template <std::ranges::random_access_range _R1, std::ranges::random_access_range _R2,
+              std::ranges::random_access_range _ROut, typename Comp = std::ranges::less, typename Proj1 = std::identity,
+              typename Proj2 = std::identity>
+    set_difference_result_t<_R1, _R2, _ROut>
+    operator()(_R1&& r_1, _R2&& r_2, _ROut&& r_out, Comp comp = {}, Proj1 proj1 = {}, Proj2 proj2 = {})
+    {
+        auto in1 = std::ranges::begin(r_1);
+        auto in2 = std::ranges::begin(r_2);
+        auto out = std::ranges::begin(r_out);
+        // Normalize every size to std::size_t so that the index comparisons and std::min below see a single type
+        const std::size_t n1 = std::ranges::size(r_1);
+        const std::size_t n2 = std::ranges::size(r_2);
+        const std::size_t nOut = std::ranges::size(r_out);
+
+        std::size_t idx1 = 0;
+        std::size_t idx2 = 0;
+        std::size_t idxOut = 0;
+
+        while (idx1 < n1 && idx2 < n2)
+        {
+            if (std::invoke(comp, std::invoke(proj1, in1[idx1]), std::invoke(proj2, in2[idx2])))
+            {
+                // in1[idx1] orders before in2[idx2]: emit it, or stop processing once the output range is full
+                if (idxOut >= nOut)
+                    break;
+                out[idxOut++] = in1[idx1++];
+            }
+            else if (!std::invoke(comp, std::invoke(proj2, in2[idx2]), std::invoke(proj1, in1[idx1])))
+            {
+                ++idx1; // equivalent items: skip both, nothing is written
+                ++idx2;
+            }
+            else
+            {
+                ++idx2; // in2[idx2] orders before in1[idx1]: skip it
+            }
+        }
+        // Copy whatever is left of r_1 and still fits into r_out (nothing if r_1 is exhausted or r_out is full)
+        const std::size_t remaining_space = nOut - idxOut;
+        const std::size_t remaining_input = n1 - idx1;
+        const std::size_t to_copy = std::min(remaining_space, remaining_input);
+        std::copy(in1 + idx1, in1 + idx1 + to_copy, out + idxOut);
+        idx1 += to_copy;
+        idxOut += to_copy;
+
+        assert(idx1 <= n1);
+        assert(idxOut <= nOut);
+
+        // Return the stop positions; the shape of the result depends on the C++23 compatibility mode
+#if ONEDPL_RANGES_SET_DIFFERENCE_CPP23_RESULT
+        return {in1 + idx1, out + idxOut};
+#else
+        return {in1 + idx1, in2 + idx2, out + idxOut};
+#endif
+    }
+} set_difference_checker;
+
+void
+test_set_difference_checker()
+{
+    // oneapi::dpl::ranges::set_difference logic
+    {
+        // set1:                   1, 2, 3, 4, 5,             10, 11, 12, 13, 14, 15
+        // set2:                   1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,                 20, 21, 22, 23, 24, 25
+        //                         -------------------------------------------------^---------------------------------------
+        // res:                                                                     |
+        // final position in set1: -------------------------------------------------+
+        // final position in set2:--------------------------------------------------+
+
+        std::vector<int> set1{1, 2, 3, 4, 5, 10, 11, 12, 13, 14, 15};
+        std::vector<int> set2{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 21, 22, 23, 24, 25};
+        std::vector<int> set3(set1.size() + set2.size());
+        auto res = set_difference_checker(set1, set2, set3);
+#if ONEDPL_RANGES_SET_DIFFERENCE_CPP23_RESULT
+        EXPECT_EQ(res.in, set1.end(), "Wrong 'in' state of result");
+        EXPECT_EQ(res.out, set3.begin(), "Wrong 'out' state of result");
+#else
+        EXPECT_EQ(res.in1, set1.end(), "Wrong 'in1' state of result");
+        EXPECT_EQ(res.in2, set2.begin() + 15, "Wrong 'in2' state of result");
+        EXPECT_EQ(res.out, set3.begin(), "Wrong 'out' state of result");
+#    endif
+    }
+
+    // oneapi::dpl::ranges::set_difference logic
+    {
+        // set1:                   1, 2, 3, 4, 5,             10, 11, 12, 13, 14, 15
+        // set2:                   1, 2, 3, 4, 5, 6, 7, 8, 9,                                          20, 21, 22, 23, 24, 25
+        //                         --------------------------------------------------^---------------------------------------
+        // res:                                               10, 11, 12, 13, 14, 15 |
+        // final position in set1: --------------------------------------------------+
+        // final position in set2:---------------------------------------------------+
+
+        std::vector<int> set1{1, 2, 3, 4, 5, 10, 11, 12, 13, 14, 15};
+        std::vector<int> set2{1, 2, 3, 4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25};
+        std::vector<int> set3(set1.size() + set2.size());
+        auto res = set_difference_checker(set1, set2, set3);
+#if ONEDPL_RANGES_SET_DIFFERENCE_CPP23_RESULT
+        EXPECT_EQ(res.in, set1.end(), "Wrong 'in' state of result");
+#else
+        EXPECT_EQ(res.in1, set1.end(), "Wrong 'in1' state of result");
+        EXPECT_EQ(res.in2, set2.begin() + 9, "Wrong 'in2' state of result");
+#endif
+
+        const std::vector<int> resExpected{10, 11, 12, 13, 14, 15};
+
+        EXPECT_EQ(res.out, set3.begin() + resExpected.size(), "Wrong 'out' state of result");
+
+        EXPECT_EQ_N(resExpected.begin(), set3.begin(), resExpected.size(), "Wrong output data state");
+    }
+
+    // oneapi::dpl::ranges::set_difference logic
+    {
+        // set1:                   1, 2, 3, 4, 5, 6, 7, 8, 9, 10,                 15, 16, 17, 18, 19, 20
+        // set2:                            4, 5, 6, 7,           11, 12, 13, 15, 16, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25
+        //                         ---------------------------------------------------------------------^-------------------
+        // res:                    1, 2, 3,             8, 9, 10                                        |
+        // final position in set1: ---------------------------------------------------------------------+
+        // final position in set2: ---------------------------------------------------------------------+
+
+        std::vector<int> set1{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 16, 17, 18, 19, 20};
+        std::vector<int> set2{4, 5, 6, 7, 11, 12, 13, 15, 16, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25};
+        std::vector<int> set3(set1.size() + set2.size());
+        auto res = set_difference_checker(set1, set2, set3);
+#if ONEDPL_RANGES_SET_DIFFERENCE_CPP23_RESULT
+        EXPECT_EQ(res.in, set1.end(), "Wrong 'in' state of result");
+#else                                                               
+        EXPECT_EQ(res.in1, set1.end(), "Wrong 'in1' state of result");
+        EXPECT_EQ(res.in2, set2.begin() + 14, "Wrong 'in2' state of result");
+#endif
+
+        const std::vector<int> resExpected{1, 2, 3, 8, 9, 10};
+
+        EXPECT_EQ(res.out, set3.begin() + resExpected.size(), "Wrong 'out' state of result");
+
+        EXPECT_EQ_N(resExpected.begin(), set3.begin(), resExpected.size(), "Wrong output data state");
+    }
+}
+#endif // _ENABLE_STD_RANGES_TESTING
 
 int
 main()
 {
     bool bProcessed = false;
 
-#if _ENABLE_STD_RANGES_TESTING && !_PSTL_LIBCPP_RANGE_SET_BROKEN
-    using namespace test_std_ranges;
-    namespace dpl_ranges = oneapi::dpl::ranges;
-
-    // TODO: use data_in_in_out_lim when set_difference supports
-    // output range not-sufficiently large to hold all the processed elements
-    // this will also require adding a custom serial implementation of the algorithm into the checker
+#if _ENABLE_STD_RANGES_TESTING
 
-    auto checker = [](std::ranges::random_access_range auto&& r1,
-                      std::ranges::random_access_range auto&& r2,
-                      std::ranges::random_access_range auto&& r_out, auto&&... args)
-    {
-        auto res = std::ranges::set_difference(std::forward<decltype(r1)>(r1), std::forward<decltype(r2)>(r2),
-                                               std::ranges::begin(r_out), std::forward<decltype(args)>(args)...);
+    // Check the correctness of the set_difference_checker against the logic of std::ranges::set_difference
+    test_set_difference_checker();
 
-        using ret_type = std::ranges::set_difference_result<std::ranges::borrowed_iterator_t<decltype(r1)>,
-                                                            std::ranges::borrowed_iterator_t<decltype(r_out)>>;
-        return ret_type{res.in, res.out};
-    };
+    using namespace test_std_ranges;
+    namespace dpl_ranges = oneapi::dpl::ranges;
 
-    test_range_algo<0, int, data_in_in_out, div3_t, mul1_t>{big_sz}(dpl_ranges::set_difference, checker);
-    test_range_algo<1, int, data_in_in_out, div3_t, mul1_t>{big_sz}(dpl_ranges::set_difference, checker, std::ranges::less{}, proj);
+    test_range_algo<0, int, data_in_in_out_lim, div3_t, mul1_t>{big_sz}(dpl_ranges::set_difference, set_difference_checker);
+    test_range_algo<1, int, data_in_in_out_lim, div3_t, mul1_t>{big_sz}(dpl_ranges::set_difference, set_difference_checker, std::ranges::less{}, proj);
 
     // Testing the cut-off with the serial implementation (less than __set_algo_cut_off)
-    test_range_algo<2, int, data_in_in_out, div3_t, mul1_t>{100}(dpl_ranges::set_difference, checker, std::ranges::less{}, proj, proj);
+    test_range_algo<2, int, data_in_in_out_lim, div3_t, mul1_t>{100}(dpl_ranges::set_difference, set_difference_checker, std::ranges::less{}, proj, proj);
 
-    test_range_algo<3,  P2, data_in_in_out, div3_t, mul1_t>{}(dpl_ranges::set_difference, checker, std::ranges::less{}, &P2::x, &P2::x);
-    test_range_algo<4,  P2, data_in_in_out, div3_t, mul1_t>{}(dpl_ranges::set_difference, checker, std::ranges::less{}, &P2::proj, &P2::proj);
+    test_range_algo<3,  P2, data_in_in_out_lim, div3_t, mul1_t>{}(dpl_ranges::set_difference, set_difference_checker, std::ranges::less{}, &P2::x, &P2::x);
+    test_range_algo<4,  P2, data_in_in_out_lim, div3_t, mul1_t>{}(dpl_ranges::set_difference, set_difference_checker, std::ranges::less{}, &P2::proj, &P2::proj);
 
     // Testing no intersection
     auto large_shift = [](auto&& v) { return v + 5000; };
     using ls_t = decltype(large_shift);
-    test_range_algo<5, int, data_in_in_out, mul1_t, ls_t>{1000}(dpl_ranges::set_difference, checker);
-    test_range_algo<6, int, data_in_in_out, ls_t, mul1_t>{1000}(dpl_ranges::set_difference, checker);
+    test_range_algo<5, int, data_in_in_out_lim,  mul1_t, ls_t>{1000}(dpl_ranges::set_difference, set_difference_checker);
+    test_range_algo<6, int, data_in_in_out_lim, ls_t, mul1_t>{1000}(dpl_ranges::set_difference, set_difference_checker);
 
     // Check if projections are applied to the right sequences and trigger a compile-time error if not
     test_mixed_types_host();
 #if TEST_DPCPP_BACKEND_PRESENT
+
+// TODO: remove this macro check once range-based set operations support a bounded output range with hetero policies
+#if ONEDPL_RANGES_SET_DIFFERENCE_CPP23_RESULT
     test_mixed_types_device();
+#endif
+
 #endif
 
     bProcessed = true;
 
-#endif //_ENABLE_STD_RANGES_TESTING && !_PSTL_LIBCPP_RANGE_SET_BROKEN
+#endif //_ENABLE_STD_RANGES_TESTING
 
     return TestUtils::done(bProcessed);
 }
diff --git a/test/parallel_api/ranges/std_ranges_set_intersection.pass.cpp b/test/parallel_api/ranges/std_ranges_set_intersection.pass.cpp
index 892a1aadd9d..79aa171ee0f 100644
--- a/test/parallel_api/ranges/std_ranges_set_intersection.pass.cpp
+++ b/test/parallel_api/ranges/std_ranges_set_intersection.pass.cpp
@@ -15,7 +15,33 @@
 
 #include "std_ranges_test.h"
 
-#if _ENABLE_STD_RANGES_TESTING && !_PSTL_LIBCPP_RANGE_SET_BROKEN
+#if _ENABLE_STD_RANGES_TESTING
+namespace test_std_ranges
+{
+// TODO: remove once range-based set operations support a bounded output range with hetero policies.
+template <>
+struct ResolveTestDataModeForHeteroPolicy<TestDataMode::data_in_out_lim>
+{
+#if STD_RANGES_SET_INTERSECTION_BROKEN_FOR_HETERO_POLICY // flag is false only when this macro is non-zero
+    static constexpr bool RunTestForHeteroPolicy = false;
+#else
+    static constexpr bool RunTestForHeteroPolicy = true;
+#endif
+    static constexpr TestDataMode res_mode = TestDataMode::data_in_out; // mode substituted for data_in_out_lim
+};
+
+// TODO: remove once range-based set operations support a bounded output range with hetero policies.
+template <>
+struct ResolveTestDataModeForHeteroPolicy<TestDataMode::data_in_in_out_lim>
+{
+#if STD_RANGES_SET_INTERSECTION_BROKEN_FOR_HETERO_POLICY // flag is false only when this macro is non-zero
+    static constexpr bool RunTestForHeteroPolicy = false;
+#else
+    static constexpr bool RunTestForHeteroPolicy = true;
+#endif
+    static constexpr TestDataMode res_mode = TestDataMode::data_in_in_out; // mode substituted for data_in_in_out_lim
+};
+} // namespace test_std_ranges
 
 void test_mixed_types_host()
 {
@@ -29,14 +55,10 @@ void test_mixed_types_host()
     std::vector<int> out_unseq(out_expected.size(), 0xCD);
     std::vector<int> out_par_unseq(out_expected.size(), 0xCD);
 
-    oneapi::dpl::ranges::set_intersection(
-        oneapi::dpl::execution::seq, r1, r2, out_seq, std::ranges::less{}, test_std_ranges::proj_a, test_std_ranges::proj_b);
-    oneapi::dpl::ranges::set_intersection(
-        oneapi::dpl::execution::par, r1, r2, out_par, std::ranges::less{}, test_std_ranges::proj_a, test_std_ranges::proj_b);
-    oneapi::dpl::ranges::set_intersection(
-        oneapi::dpl::execution::unseq, r1, r2, out_unseq, std::ranges::less{}, test_std_ranges::proj_a, test_std_ranges::proj_b);
-    oneapi::dpl::ranges::set_intersection(
-        oneapi::dpl::execution::par_unseq, r1, r2, out_par_unseq, std::ranges::less{}, test_std_ranges::proj_a, test_std_ranges::proj_b);
+    oneapi::dpl::ranges::set_intersection(oneapi::dpl::execution::seq,       r1, r2, out_seq,       std::ranges::less{}, test_std_ranges::proj_a, test_std_ranges::proj_b);
+    oneapi::dpl::ranges::set_intersection(oneapi::dpl::execution::par,       r1, r2, out_par,       std::ranges::less{}, test_std_ranges::proj_a, test_std_ranges::proj_b);
+    oneapi::dpl::ranges::set_intersection(oneapi::dpl::execution::unseq,     r1, r2, out_unseq,     std::ranges::less{}, test_std_ranges::proj_a, test_std_ranges::proj_b);
+    oneapi::dpl::ranges::set_intersection(oneapi::dpl::execution::par_unseq, r1, r2, out_par_unseq, std::ranges::less{}, test_std_ranges::proj_a, test_std_ranges::proj_b);
 
     EXPECT_EQ_RANGES(out_expected, out_seq, "wrong result with seq policy");
     EXPECT_EQ_RANGES(out_expected, out_par, "wrong result with par policy");
@@ -71,49 +93,136 @@ void test_mixed_types_device()
     }
 }
 #endif // TEST_DPCPP_BACKEND_PRESENT
-#endif // _ENABLE_STD_RANGES_TESTING && !_PSTL_LIBCPP_RANGE_SET_BROKEN
+
+struct // unnamed functor type; its only instance, set_intersection_checker, is declared at the closing brace
+{
+    template <std::ranges::random_access_range _R1, std::ranges::random_access_range _R2,
+              std::ranges::random_access_range _ROut, typename Comp = std::ranges::less, typename Proj1 = std::identity,
+              typename Proj2 = std::identity>
+    std::ranges::set_intersection_result<std::ranges::borrowed_iterator_t<_R1>, std::ranges::borrowed_iterator_t<_R2>,
+                                         std::ranges::borrowed_iterator_t<_ROut>>
+    operator()(_R1&& r_1, _R2&& r_2, _ROut&& r_out, Comp comp = {}, Proj1 proj1 = {}, Proj2 proj2 = {})
+    { // Index-based set intersection of r_1 and r_2 that writes at most size(r_out) elements into r_out.
+        auto in1 = std::ranges::begin(r_1);
+        auto in2 = std::ranges::begin(r_2);
+        auto out = std::ranges::begin(r_out);
+
+        const auto n1 = std::ranges::size(r_1);
+        const auto n2 = std::ranges::size(r_2);
+        const auto nOut = std::ranges::size(r_out); // capacity of the output range
+
+        std::size_t idx1 = 0;
+        std::size_t idx2 = 0;
+        std::size_t idxOut = 0;
+
+        while (idx1 < n1 && idx2 < n2) // stop as soon as either input is exhausted
+        {
+            if (std::invoke(comp, std::invoke(proj1, in1[idx1]), std::invoke(proj2, in2[idx2])))
+            {
+                ++idx1; // in1[idx1] orders before in2[idx2]: not in the intersection, skip it
+            }
+            else if (std::invoke(comp, std::invoke(proj2, in2[idx2]), std::invoke(proj1, in1[idx1])))
+            {
+                ++idx2; // in2[idx2] orders before in1[idx1]: not in the intersection, skip it
+            }
+            else if (idxOut < nOut) // equivalent elements and the output still has room
+            {
+                out[idxOut++] = in1[idx1++]; // the copied element is the one from the first range
+                ++idx2;
+            }
+            else
+            {
+                break; // equivalent elements but the output is full: stop without consuming either input
+            }
+        }
+
+        return {in1 + idx1, in2 + idx2, out + idxOut}; // positions reached in r_1 and r_2, and the end of the written output
+    }
+} set_intersection_checker;
+
+void
+test_set_intersection_checker()
+{ // Self-test: runs set_intersection_checker on two hand-computed cases and checks in1, in2, out and the data written.
+    // Case 1, oneapi::dpl::ranges::set_intersection logic: set1 is exhausted first, set2 stops at 20
+    {
+        // set1:                   1, 2, 3, 4, 5,             10, 11, 12, 13, 14, 15
+        // set2:                   1, 2, 3, 4, 5, 6, 7, 8, 9,                                         20, 21, 22, 23, 24, 25
+        //                         -------------------------------------------------^---------------------------------------
+        // res:                    1, 2, 3, 4, 5                                    |
+        // final position in set1: -------------------------------------------------+
+        // final position in set2: -------------------------------------------------+
+
+        std::vector<int> set1{1, 2, 3, 4, 5, 10, 11, 12, 13, 14, 15};
+        std::vector<int> set2{1, 2, 3, 4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25};
+        std::vector<int> set3(set1.size() + set2.size()); // output is larger than any possible result
+        auto res = set_intersection_checker(set1, set2, set3);
+        EXPECT_EQ(res.in1, set1.end(), "Wrong 'in1' state of result");
+        EXPECT_EQ(res.in2, std::find(set2.begin(), set2.end(), 20), "Wrong 'in2' state of result");
+
+        const std::vector<int> resExpected{1, 2, 3, 4, 5};
+
+        EXPECT_EQ(res.out, set3.begin() + resExpected.size(), "Wrong 'out' state of result");
+
+        EXPECT_EQ_N(resExpected.begin(), set3.begin(), resExpected.size(), "Wrong output data state");
+    }
+
+    // Case 2, oneapi::dpl::ranges::set_intersection logic: the duplicate 16 in set2 is emitted once, set2 stops at 21
+    {
+        // set1:                   1, 2, 3, 4, 5, 6, 7, 8, 9, 10,                 15, 16, 17, 18, 19, 20
+        // set2:                            4, 5, 6, 7,           11, 12, 13, 15, 16, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25
+        //                         ---------------------------------------------------------------------^-------------------
+        // res:                             4, 5, 6, 7,                           15, 16, 17, 18, 19, 20|
+        // final position in set1: ---------------------------------------------------------------------+
+        // final position in set2: ---------------------------------------------------------------------+
+
+        std::vector<int> set1{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 16, 17, 18, 19, 20};
+        std::vector<int> set2{4, 5, 6, 7, 11, 12, 13, 15, 16, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25};
+        std::vector<int> set3(set1.size() + set2.size()); // output is larger than any possible result
+        auto res = set_intersection_checker(set1, set2, set3);
+        EXPECT_EQ(res.in1, set1.end(), "Wrong 'in1' state of result");
+        EXPECT_EQ(res.in2, std::find(set2.begin(), set2.end(), 21), "Wrong 'in2' state of result");
+
+        const std::vector<int> resExpected{4, 5, 6, 7, 15, 16, 17, 18, 19, 20};
+
+        EXPECT_EQ(res.out, set3.begin() + resExpected.size(), "Wrong 'out' state of result");
+
+        EXPECT_EQ_N(resExpected.begin(), set3.begin(), resExpected.size(), "Wrong output data state");
+    }
+}
+#endif // _ENABLE_STD_RANGES_TESTING
 
 int
 main()
 {
     bool bProcessed = false;
 
-#if _ENABLE_STD_RANGES_TESTING && !_PSTL_LIBCPP_RANGE_SET_BROKEN
-    using namespace test_std_ranges;
-    namespace dpl_ranges = oneapi::dpl::ranges;
+#if _ENABLE_STD_RANGES_TESTING
 
-    auto set_intersection_checker = [](std::ranges::random_access_range auto&& r1,
-                                       std::ranges::random_access_range auto&& r2,
-                                       std::ranges::random_access_range auto&& r_out, auto&&... args)
-    {
-        auto res = std::ranges::set_intersection(std::forward<decltype(r1)>(r1), std::forward<decltype(r2)>(r2),
-                                                 std::ranges::begin(r_out), std::forward<decltype(args)>(args)...);
+    // Sanity-check set_intersection_checker on hand-computed cases (oneapi::dpl::ranges::set_intersection logic)
+    test_set_intersection_checker();
 
-        using ret_type = std::ranges::set_intersection_result<std::ranges::borrowed_iterator_t<decltype(r1)>,
-                                                              std::ranges::borrowed_iterator_t<decltype(r2)>,
-                                                              std::ranges::borrowed_iterator_t<decltype(r_out)>>;
-        return ret_type{res.in1, res.in2, res.out};
-    };
+    using namespace test_std_ranges;
+    namespace dpl_ranges = oneapi::dpl::ranges;
 
-    test_range_algo<0, int, data_in_in_out, mul1_t, div3_t>{big_sz}(dpl_ranges::set_intersection, set_intersection_checker);
-    test_range_algo<1, int, data_in_in_out, mul1_t, div3_t>{big_sz}(dpl_ranges::set_intersection, set_intersection_checker, std::ranges::less{}, proj);
+    test_range_algo<0, int, data_in_in_out_lim, mul1_t, div3_t>{big_sz}(dpl_ranges::set_intersection, set_intersection_checker);
+    test_range_algo<1, int, data_in_in_out_lim, mul1_t, div3_t>{big_sz}(dpl_ranges::set_intersection, set_intersection_checker, std::ranges::less{}, proj);
 
     // Testing the cut-off with the serial implementation (less than __set_algo_cut_off)
-    test_range_algo<2, int, data_in_in_out, mul1_t, div3_t>{100}(dpl_ranges::set_intersection, set_intersection_checker, std::ranges::less{}, proj, proj);
+    test_range_algo<2, int, data_in_in_out_lim, mul1_t, div3_t>{100}(dpl_ranges::set_intersection, set_intersection_checker, std::ranges::less{}, proj, proj);
 
-    test_range_algo<3,  P2, data_in_in_out, mul1_t, div3_t>{}(dpl_ranges::set_intersection, set_intersection_checker, std::ranges::less{}, &P2::x, &P2::x);
-    test_range_algo<4,  P2, data_in_in_out, mul1_t, div3_t>{}(dpl_ranges::set_intersection, set_intersection_checker, std::ranges::less{}, &P2::proj, &P2::proj);
+    test_range_algo<3,  P2, data_in_in_out_lim, mul1_t, div3_t>{}(dpl_ranges::set_intersection, set_intersection_checker, std::ranges::less{}, &P2::x, &P2::x);
+    test_range_algo<4,  P2, data_in_in_out_lim, mul1_t, div3_t>{}(dpl_ranges::set_intersection, set_intersection_checker, std::ranges::less{}, &P2::proj, &P2::proj);
 
     // Testing partial intersection less than __set_algo_cut_off
     auto medium_shift = [](auto&& v) { return v + 400; };
     using ms_t = decltype(medium_shift);
-    test_range_algo<5, int, data_in_in_out, mul1_t, ms_t>{600}(dpl_ranges::set_intersection, set_intersection_checker);
+    test_range_algo<5, int, data_in_in_out_lim, mul1_t, ms_t>{600}(dpl_ranges::set_intersection, set_intersection_checker);
 
     // Testing no intersection
     auto large_shift = [](auto&& v) { return v + 5000; };
     using ls_t = decltype(large_shift);
-    test_range_algo<6, int, data_in_in_out, mul1_t, ls_t>{1000}(dpl_ranges::set_intersection, set_intersection_checker);
-    test_range_algo<7, int, data_in_in_out, ls_t, mul1_t>{1000}(dpl_ranges::set_intersection, set_intersection_checker);
+    test_range_algo<6, int, data_in_in_out_lim, mul1_t, ls_t>{1000}(dpl_ranges::set_intersection, set_intersection_checker);
+    test_range_algo<7, int, data_in_in_out_lim, ls_t, mul1_t>{1000}(dpl_ranges::set_intersection, set_intersection_checker);
 
     // Check if projections are applied to the right sequences and trigger a compile-time error if not
     test_mixed_types_host();
@@ -123,7 +232,7 @@ main()
 
     bProcessed = true;
 
-#endif //_ENABLE_STD_RANGES_TESTING && !_PSTL_LIBCPP_RANGE_SET_BROKEN
+#endif //_ENABLE_STD_RANGES_TESTING
 
     return TestUtils::done(bProcessed);
 }
diff --git a/test/parallel_api/ranges/std_ranges_set_symmetric_difference.pass.cpp b/test/parallel_api/ranges/std_ranges_set_symmetric_difference.pass.cpp
index deac50ddfb5..f7048e95778 100644
--- a/test/parallel_api/ranges/std_ranges_set_symmetric_difference.pass.cpp
+++ b/test/parallel_api/ranges/std_ranges_set_symmetric_difference.pass.cpp
@@ -15,9 +15,25 @@
 
 #include "std_ranges_test.h"
 
-#if _ENABLE_STD_RANGES_TESTING && !_PSTL_LIBCPP_RANGE_SET_BROKEN
+#if _ENABLE_STD_RANGES_TESTING
 namespace test_std_ranges
 {
+// TODO: remove once range-based set operations support a bounded output range with hetero policies.
+template <>
+struct ResolveTestDataModeForHeteroPolicy<TestDataMode::data_in_out_lim>
+{
+    static constexpr bool RunTestForHeteroPolicy = true; // unconditional in this test file
+    static constexpr TestDataMode res_mode = TestDataMode::data_in_out; // mode substituted for data_in_out_lim
+};
+
+// TODO: remove once range-based set operations support a bounded output range with hetero policies.
+template <>
+struct ResolveTestDataModeForHeteroPolicy<TestDataMode::data_in_in_out_lim>
+{
+    static constexpr bool RunTestForHeteroPolicy = true; // unconditional in this test file
+    static constexpr TestDataMode res_mode = TestDataMode::data_in_in_out; // mode substituted for data_in_in_out_lim
+};
+
 template<>
 int out_size_with_empty_in2<std::remove_cvref_t<decltype(oneapi::dpl::ranges::set_symmetric_difference)>>(int in1_size)
 {
@@ -42,14 +58,10 @@ void test_mixed_types_host()
     std::vector<int> out_unseq(out_expected.size(), 0xCD);
     std::vector<int> out_par_unseq(out_expected.size(), 0xCD);
 
-    oneapi::dpl::ranges::set_symmetric_difference(
-        oneapi::dpl::execution::seq, r1, r2, out_seq, std::ranges::less{}, test_std_ranges::proj_a, test_std_ranges::proj_b);
-    oneapi::dpl::ranges::set_symmetric_difference(
-        oneapi::dpl::execution::par, r1, r2, out_par, std::ranges::less{}, test_std_ranges::proj_a, test_std_ranges::proj_b);
-    oneapi::dpl::ranges::set_symmetric_difference(
-        oneapi::dpl::execution::unseq, r1, r2, out_unseq, std::ranges::less{}, test_std_ranges::proj_a, test_std_ranges::proj_b);
-    oneapi::dpl::ranges::set_symmetric_difference(
-        oneapi::dpl::execution::par_unseq, r1, r2, out_par_unseq, std::ranges::less{}, test_std_ranges::proj_a, test_std_ranges::proj_b);
+    oneapi::dpl::ranges::set_symmetric_difference(oneapi::dpl::execution::seq,       r1, r2, out_seq,       std::ranges::less{}, test_std_ranges::proj_a, test_std_ranges::proj_b);
+    oneapi::dpl::ranges::set_symmetric_difference(oneapi::dpl::execution::par,       r1, r2, out_par,       std::ranges::less{}, test_std_ranges::proj_a, test_std_ranges::proj_b);
+    oneapi::dpl::ranges::set_symmetric_difference(oneapi::dpl::execution::unseq,     r1, r2, out_unseq,     std::ranges::less{}, test_std_ranges::proj_a, test_std_ranges::proj_b);
+    oneapi::dpl::ranges::set_symmetric_difference(oneapi::dpl::execution::par_unseq, r1, r2, out_par_unseq, std::ranges::less{}, test_std_ranges::proj_a, test_std_ranges::proj_b);
 
     EXPECT_EQ_RANGES(out_expected, out_seq, "wrong result with seq policy");
     EXPECT_EQ_RANGES(out_expected, out_par, "wrong result with par policy");
@@ -84,42 +96,95 @@ void test_mixed_types_device()
     }
 }
 #endif // TEST_DPCPP_BACKEND_PRESENT
-#endif // _ENABLE_STD_RANGES_TESTING && !_PSTL_LIBCPP_RANGE_SET_BROKEN
+
+struct // unnamed functor type; its only instance, set_symmetric_difference_checker, is declared at the closing brace
+{
+    template <std::ranges::random_access_range _R1, std::ranges::random_access_range _R2,
+              std::ranges::random_access_range _ROut, typename Comp = std::ranges::less, typename Proj1 = std::identity,
+              typename Proj2 = std::identity>
+    std::ranges::set_symmetric_difference_result<std::ranges::borrowed_iterator_t<_R1>,
+                                                 std::ranges::borrowed_iterator_t<_R2>,
+                                                 std::ranges::borrowed_iterator_t<_ROut>>
+    operator()(_R1&& r_1, _R2&& r_2, _ROut&& r_out, Comp comp = {}, Proj1 proj1 = {}, Proj2 proj2 = {})
+    { // Index-based symmetric difference of r_1 and r_2 that writes at most size(r_out) elements into r_out.
+        auto in1 = std::ranges::begin(r_1);
+        auto in2 = std::ranges::begin(r_2);
+        auto out = std::ranges::begin(r_out);
+
+        const auto n1 = std::ranges::size(r_1);
+        const auto n2 = std::ranges::size(r_2);
+        const auto nOut = std::ranges::size(r_out); // capacity of the output range
+
+        std::size_t idx1 = 0;
+        std::size_t idx2 = 0;
+        std::size_t idxOut = 0;
+
+        while (idx1 < n1)
+        {
+            if (idx2 == n2) // r_2 is exhausted: the rest of r_1 belongs to the result
+            {
+                const auto remaining_space = nOut - idxOut;
+                const auto remaining_input = n1 - idx1;
+                const auto to_copy = std::min(remaining_space, remaining_input); // copy only what still fits
+                std::copy(in1 + idx1, in1 + idx1 + to_copy, out + idxOut);
+
+                idx1 += to_copy;
+                idxOut += to_copy;
+                break;
+            }
+
+            if (std::invoke(comp, std::invoke(proj1, in1[idx1]), std::invoke(proj2, in2[idx2])))
+            {
+                if (idxOut < nOut) // in1[idx1] orders first, so it is only in r_1: emit it if there is room
+                    out[idxOut++] = in1[idx1++];
+                else
+                    break; // output is full: stop without consuming the element
+            }
+            else
+            {
+                if (std::invoke(comp, std::invoke(proj2, in2[idx2]), std::invoke(proj1, in1[idx1])))
+                {
+                    if (idxOut < nOut) // in2[idx2] orders first, so it is only in r_2: emit it if there is room
+                        out[idxOut++] = in2[idx2];
+                    else
+                        break; // output is full: stop before the shared ++idx2 below
+                }
+                else
+                    ++idx1; // equivalent elements: present in both ranges, emit nothing
+                ++idx2; // r_2 element consumed, whether it was emitted or matched
+            }
+        }
+
+        const auto remaining_space = nOut - idxOut; // zero when the loop stopped on a full output
+        const auto remaining_input = n2 - idx2; // zero when r_2 was already exhausted
+        const auto to_copy = std::min(remaining_space, remaining_input);
+        std::copy(in2 + idx2, in2 + idx2 + to_copy, out + idxOut); // tail of r_2 left after r_1 ran out
+
+        idx2 += to_copy;
+        idxOut += to_copy;
+
+        return {in1 + idx1, in2 + idx2, out + idxOut}; // positions reached in r_1 and r_2, and the end of the written output
+    }
+} set_symmetric_difference_checker;
+#endif // _ENABLE_STD_RANGES_TESTING
 
 int
 main()
 {
     bool bProcessed = false;
 
-#if _ENABLE_STD_RANGES_TESTING && !_PSTL_LIBCPP_RANGE_SET_BROKEN
+#if _ENABLE_STD_RANGES_TESTING
     using namespace test_std_ranges;
     namespace dpl_ranges = oneapi::dpl::ranges;
 
-    // TODO: use data_in_in_out_lim when set_symmetric_difference supports
-    // output range not-sufficiently large to hold all the processed elements
-    // this will also require adding a custom serial implementation of the algorithm into the checker
-
-    auto checker = [](std::ranges::random_access_range auto&& r1,
-                      std::ranges::random_access_range auto&& r2,
-                      std::ranges::random_access_range auto&& r_out, auto&&... args)
-    {
-        auto res = std::ranges::set_symmetric_difference(std::forward<decltype(r1)>(r1), std::forward<decltype(r2)>(r2),
-                                                         std::ranges::begin(r_out), std::forward<decltype(args)>(args)...);
-
-        using ret_type = std::ranges::set_symmetric_difference_result<std::ranges::borrowed_iterator_t<decltype(r1)>,
-                                                                      std::ranges::borrowed_iterator_t<decltype(r2)>,
-                                                                      std::ranges::borrowed_iterator_t<decltype(r_out)>>;
-        return ret_type{res.in1, res.in2, res.out};
-    };
-
-    test_range_algo<0, int, data_in_in_out, div3_t, mul1_t>{big_sz}(dpl_ranges::set_symmetric_difference, checker);
-    test_range_algo<1, int, data_in_in_out, mul1_t, div3_t>{big_sz}(dpl_ranges::set_symmetric_difference, checker, std::ranges::less{}, proj);
+    test_range_algo<0, int, data_in_in_out_lim, div3_t, mul1_t>{big_sz}(dpl_ranges::set_symmetric_difference, set_symmetric_difference_checker);
+    test_range_algo<1, int, data_in_in_out_lim, mul1_t, div3_t>{big_sz}(dpl_ranges::set_symmetric_difference, set_symmetric_difference_checker, std::ranges::less{}, proj);
 
     // Testing the cut-off with the serial implementation (less than __set_algo_cut_off)
-    test_range_algo<2, int, data_in_in_out, mul1_t, mul1_t>{100}(dpl_ranges::set_symmetric_difference, checker, std::ranges::less{}, proj, proj);
+    test_range_algo<2, int, data_in_in_out_lim, mul1_t, mul1_t>{100}(dpl_ranges::set_symmetric_difference, set_symmetric_difference_checker, std::ranges::less{}, proj, proj);
 
-    test_range_algo<3,  P2, data_in_in_out, mul1_t, div3_t>{}(dpl_ranges::set_symmetric_difference, checker, std::ranges::less{}, &P2::x, &P2::x);
-    test_range_algo<4,  P2, data_in_in_out, mul1_t, div3_t>{}(dpl_ranges::set_symmetric_difference, checker, std::ranges::less{}, &P2::proj, &P2::proj);
+    test_range_algo<3,  P2, data_in_in_out_lim, mul1_t, div3_t>{}(dpl_ranges::set_symmetric_difference, set_symmetric_difference_checker, std::ranges::less{}, &P2::x, &P2::x);
+    test_range_algo<4,  P2, data_in_in_out_lim, mul1_t, div3_t>{}(dpl_ranges::set_symmetric_difference, set_symmetric_difference_checker, std::ranges::less{}, &P2::proj, &P2::proj);
 
     // Check if projections are applied to the right sequences and trigger a compile-time error if not
     test_mixed_types_host();
@@ -129,7 +194,7 @@ main()
 
     bProcessed = true;
 
-#endif //_ENABLE_STD_RANGES_TESTING && !_PSTL_LIBCPP_RANGE_SET_BROKEN
+#endif //_ENABLE_STD_RANGES_TESTING
 
     return TestUtils::done(bProcessed);
 }
diff --git a/test/parallel_api/ranges/std_ranges_set_union.pass.cpp b/test/parallel_api/ranges/std_ranges_set_union.pass.cpp
index b5e0e0e5c1c..e1127dd36e3 100644
--- a/test/parallel_api/ranges/std_ranges_set_union.pass.cpp
+++ b/test/parallel_api/ranges/std_ranges_set_union.pass.cpp
@@ -15,9 +15,25 @@
 
 #include "std_ranges_test.h"
 
-#if _ENABLE_STD_RANGES_TESTING && !_PSTL_LIBCPP_RANGE_SET_BROKEN
+#if _ENABLE_STD_RANGES_TESTING
 namespace test_std_ranges
 {
+// TODO: remove once range-based set operations support a bounded output range with hetero policies.
+template <>
+struct ResolveTestDataModeForHeteroPolicy<TestDataMode::data_in_out_lim>
+{
+    static constexpr bool RunTestForHeteroPolicy = true; // unconditional in this test file
+    static constexpr TestDataMode res_mode = TestDataMode::data_in_out; // mode substituted for data_in_out_lim
+};
+
+// TODO: remove once range-based set operations support a bounded output range with hetero policies.
+template <>
+struct ResolveTestDataModeForHeteroPolicy<TestDataMode::data_in_in_out_lim>
+{
+    static constexpr bool RunTestForHeteroPolicy = true; // unconditional in this test file
+    static constexpr TestDataMode res_mode = TestDataMode::data_in_in_out; // mode substituted for data_in_in_out_lim
+};
+
 template<>
 inline int out_size_with_empty_in2<std::remove_cvref_t<decltype(oneapi::dpl::ranges::set_union)>>(int in1_size)
 {
@@ -42,14 +58,10 @@ void test_mixed_types_host()
     std::vector<int> out_unseq(5, 0xCD);
     std::vector<int> out_par_unseq(5, 0xCD);
 
-    oneapi::dpl::ranges::set_union(
-        oneapi::dpl::execution::seq, r1, r2, out_seq, std::ranges::less{}, test_std_ranges::proj_a, test_std_ranges::proj_b);
-    oneapi::dpl::ranges::set_union(
-        oneapi::dpl::execution::par, r1, r2, out_par, std::ranges::less{}, test_std_ranges::proj_a, test_std_ranges::proj_b);
-    oneapi::dpl::ranges::set_union(
-        oneapi::dpl::execution::unseq, r1, r2, out_unseq, std::ranges::less{}, test_std_ranges::proj_a, test_std_ranges::proj_b);
-    oneapi::dpl::ranges::set_union(
-        oneapi::dpl::execution::par_unseq, r1, r2, out_par_unseq, std::ranges::less{}, test_std_ranges::proj_a, test_std_ranges::proj_b);
+    oneapi::dpl::ranges::set_union(oneapi::dpl::execution::seq,       r1, r2, out_seq,       std::ranges::less{}, test_std_ranges::proj_a, test_std_ranges::proj_b);
+    oneapi::dpl::ranges::set_union(oneapi::dpl::execution::par,       r1, r2, out_par,       std::ranges::less{}, test_std_ranges::proj_a, test_std_ranges::proj_b);
+    oneapi::dpl::ranges::set_union(oneapi::dpl::execution::unseq,     r1, r2, out_unseq,     std::ranges::less{}, test_std_ranges::proj_a, test_std_ranges::proj_b);
+    oneapi::dpl::ranges::set_union(oneapi::dpl::execution::par_unseq, r1, r2, out_par_unseq, std::ranges::less{}, test_std_ranges::proj_a, test_std_ranges::proj_b);
 
     EXPECT_EQ_RANGES(out_expected, out_seq, "wrong result with seq policy");
     EXPECT_EQ_RANGES(out_expected, out_par, "wrong result with par policy");
@@ -83,38 +95,95 @@ void test_mixed_types_device()
     }
 }
 #endif // TEST_DPCPP_BACKEND_PRESENT
-#endif // _ENABLE_STD_RANGES_TESTING && !_PSTL_LIBCPP_RANGE_SET_BROKEN
+
+struct
+{
+    template <std::ranges::random_access_range _R1, std::ranges::random_access_range _R2,
+              std::ranges::random_access_range _ROut, typename Comp = std::ranges::less, typename Proj1 = std::identity,
+              typename Proj2 = std::identity>
+    std::ranges::set_union_result<std::ranges::borrowed_iterator_t<_R1>,
+                                  std::ranges::borrowed_iterator_t<_R2>,
+                                  std::ranges::borrowed_iterator_t<_ROut>>
+    operator()(_R1&& r_1, _R2&& r_2, _ROut&& r_out, Comp comp = {}, Proj1 proj1 = {}, Proj2 proj2 = {})
+    {
+        // Serial reference set_union that writes at most std::ranges::size(r_out) elements. Example:
+        // r_1 : 0, 1, 2, 3, ..., 131081
+        // r_2 : 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, ..., 21845, 21845, 21845, 21846, 21846, 21846
+        // r_out (size = 131082) : 0, 0, 0, 1, 1, 1, 2, 2, 2, ...., 87385, 87386, 87387
+        auto in1 = std::ranges::begin(r_1);
+        auto in2 = std::ranges::begin(r_2);
+        auto out = std::ranges::begin(r_out);
+
+        const auto n1 = std::ranges::size(r_1);
+        const auto n2 = std::ranges::size(r_2);
+        const auto nOut = std::ranges::size(r_out);
+
+        std::size_t idx1 = 0;
+        std::size_t idx2 = 0;
+        std::size_t idxOut = 0;
+        // Merge while both inputs have unread elements and the output has free space
+        while (idx1 < n1 && idx2 < n2 && idxOut < nOut)
+        {
+            if (std::invoke(comp, std::invoke(proj1, in1[idx1]), std::invoke(proj2, in2[idx2])))
+            {
+                out[idxOut++] = in1[idx1++];
+            }
+            else if (std::invoke(comp, std::invoke(proj2, in2[idx2]), std::invoke(proj1, in1[idx1])))
+            {
+                out[idxOut++] = in2[idx2++];
+            }
+            else // neither is less: the elements are equivalent, take the one from r_1 and skip the one in r_2
+            {
+                out[idxOut++] = in1[idx1++];
+                ++idx2;
+            }
+        }
+        // Copy as many of the remaining r_1 elements as still fit into the output
+        if (idx1 < n1)
+        {
+            const auto remaining_space = nOut - idxOut;
+            const auto remaining_input = n1 - idx1;
+            const auto to_copy = std::min(remaining_space, remaining_input);
+            std::copy(in1 + idx1, in1 + idx1 + to_copy, out + idxOut);
+
+            idx1 += to_copy;
+            idxOut += to_copy;
+        }
+        // Then do the same for the remaining r_2 elements
+        if (idx2 < n2)
+        {
+            const auto remaining_space = nOut - idxOut;
+            const auto remaining_input = n2 - idx2;
+            const auto to_copy = std::min(remaining_space, remaining_input);
+            std::copy(in2 + idx2, in2 + idx2 + to_copy, out + idxOut);
+
+            idx2 += to_copy;
+            idxOut += to_copy;
+        }
+        // Stop positions in r_1, r_2 and r_out
+        return {in1 + idx1, in2 + idx2, out + idxOut};
+    }
+} set_union_checker;
+
+#endif // _ENABLE_STD_RANGES_TESTING
 
 int
 main()
 {
     bool bProcessed = false;
 
-#if _ENABLE_STD_RANGES_TESTING && !_PSTL_LIBCPP_RANGE_SET_BROKEN
+#if _ENABLE_STD_RANGES_TESTING
     using namespace test_std_ranges;
     namespace dpl_ranges = oneapi::dpl::ranges;
 
-    auto set_union_checker = [](std::ranges::random_access_range auto&& r1,
-                                std::ranges::random_access_range auto&& r2,
-                                std::ranges::random_access_range auto&& r_out, auto&&... args)
-    {
-        auto res = std::ranges::set_union(std::forward<decltype(r1)>(r1), std::forward<decltype(r2)>(r2),
-                                          std::ranges::begin(r_out), std::forward<decltype(args)>(args)...);
-
-        using ret_type = std::ranges::set_union_result<std::ranges::borrowed_iterator_t<decltype(r1)>,
-                                                       std::ranges::borrowed_iterator_t<decltype(r2)>,
-                                                       std::ranges::borrowed_iterator_t<decltype(r_out)>>;
-        return ret_type{res.in1, res.in2, res.out};
-    };
-
-    test_range_algo<0, int, data_in_in_out, mul1_t, div3_t>{big_sz}(dpl_ranges::set_union, set_union_checker);
-    test_range_algo<1, int, data_in_in_out, mul1_t, div3_t>{big_sz}(dpl_ranges::set_union, set_union_checker, std::ranges::less{}, proj);
+    test_range_algo<0, int, data_in_in_out_lim, mul1_t, div3_t>{big_sz}(dpl_ranges::set_union, set_union_checker);
+    test_range_algo<1, int, data_in_in_out_lim, mul1_t, div3_t>{big_sz}(dpl_ranges::set_union, set_union_checker, std::ranges::less{}, proj);
 
     // Testing the cut-off with the serial implementation (less than __set_algo_cut_off)
-    test_range_algo<2, int, data_in_in_out, mul1_t, div3_t>{100}(dpl_ranges::set_union, set_union_checker, std::ranges::less{}, proj, proj);
+    test_range_algo<2, int, data_in_in_out_lim, mul1_t, div3_t>{100}(dpl_ranges::set_union, set_union_checker, std::ranges::less{}, proj, proj);
 
-    test_range_algo<3,  P2, data_in_in_out, mul1_t, div3_t>{}(dpl_ranges::set_union, set_union_checker, std::ranges::less{}, &P2::x, &P2::x);
-    test_range_algo<4,  P2, data_in_in_out, mul1_t, div3_t>{}(dpl_ranges::set_union, set_union_checker, std::ranges::less{}, &P2::proj, &P2::proj);
+    test_range_algo<3,  P2, data_in_in_out_lim, mul1_t, div3_t>{}(dpl_ranges::set_union, set_union_checker, std::ranges::less{}, &P2::x, &P2::x);
+    test_range_algo<4,  P2, data_in_in_out_lim, mul1_t, div3_t>{}(dpl_ranges::set_union, set_union_checker, std::ranges::less{}, &P2::proj, &P2::proj);
 
     test_mixed_types_host();
 #if TEST_DPCPP_BACKEND_PRESENT
@@ -122,8 +191,7 @@ main()
 #endif // TEST_DPCPP_BACKEND_PRESENT
 
     bProcessed = true;
-
-#endif //_ENABLE_STD_RANGES_TESTING && !_PSTL_LIBCPP_RANGE_SET_BROKEN
+#endif //_ENABLE_STD_RANGES_TESTING
 
     return TestUtils::done(bProcessed);
 }
diff --git a/test/parallel_api/ranges/std_ranges_test.h b/test/parallel_api/ranges/std_ranges_test.h
index 63dcf96dd58..b6ce2bb8ef9 100644
--- a/test/parallel_api/ranges/std_ranges_test.h
+++ b/test/parallel_api/ranges/std_ranges_test.h
@@ -109,6 +109,14 @@ struct P2
 
     int proj() const { return x; }
     friend bool operator==(const P2& a, const P2& b) { return a.x == b.x && a.y == b.y; }
+
+    template <typename OStream>
+    friend OStream&
+    operator<<(OStream& os, const P2& item)
+    {
+        os << "{" << item.x << ", " << item.y << "}";
+        return os;
+    }
 };
 
 struct P3 : public P2
@@ -223,6 +231,12 @@ static constexpr bool check_in_in_result{};
 template <typename I1, typename I2>
 static constexpr bool check_in_in_result<std::ranges::in_in_result<I1, I2>> = true;
 
+template <typename T>
+static constexpr bool check_in_out_result{};
+
+template <typename I1, typename O>
+static constexpr bool check_in_out_result<std::ranges::in_out_result<I1, O>> = true;
+
 template <typename T>
 static constexpr bool check_in_in_out_result{};
 
@@ -302,6 +316,109 @@ struct test
 
     using rvalue_container_t = std::array<typename Container::value_type, 0>;
 
+    template <typename T>
+    using TmpContainerType = std::array<T,0>;
+
+    static constexpr int kParts = 2;
+
+    static constexpr int kPaddingSize = 20;
+
+    // Size of a padded output container for n payload elements: kPaddingSize guard elements at the front and back
+    static constexpr int get_padded_size(int n)
+    {
+        return n + kPaddingSize * kParts;
+    }
+
+    // Compile-time check: calling algo with one temporary (rvalue) range must give a result that is, or consists of, std::ranges::dangling
+    template <int idx, typename Policy, typename Algo, typename ...Args>
+    constexpr void
+    test_dangling_pointers_arg_1(Policy&& exec, Algo&& algo, Args&& ...args)
+    {
+        // The check is skipped when supress_dangling_iterators_check is set for the algorithm
+        if constexpr (!supress_dangling_iterators_check<std::remove_cvref_t<decltype(algo)>>)
+        {
+            using T = typename Container::value_type;
+
+            // Only the return type of the call is inspected (unevaluated decltype), algo is not run here
+            using res_ret_t = decltype(algo(CLONE_TEST_POLICY_IDX(exec, idx),
+                                            std::declval<TmpContainerType<T>>(),
+                                            args...));
+            // Fundamental result types are not checked; otherwise fail the build via an always-false dependent condition
+            if constexpr (!std::is_fundamental_v<res_ret_t>)
+            {
+                if constexpr (!all_dangling_in_result_v<res_ret_t>)
+                    static_assert(!std::is_same_v<res_ret_t, res_ret_t>, "res_ret_t is expected to be or consist of std::ranges::dangling");
+            }
+        }
+    }
+
+    // Compile-time check: calling algo with two temporary (rvalue) ranges must give a result that is, or consists of, std::ranges::dangling
+    template <int idx, typename Policy, typename Algo, typename ...Args>
+    constexpr void
+    test_dangling_pointers_args_2(Policy&& exec, Algo&& algo, Args&& ...args)
+    {
+        // The check is skipped when supress_dangling_iterators_check is set for the algorithm
+        if constexpr (!supress_dangling_iterators_check<std::remove_cvref_t<decltype(algo)>>)
+        {
+            using T = typename Container::value_type;
+
+            // Only the return type of the call is inspected (unevaluated decltype), algo is not run here
+            using res_ret_t = decltype(algo(CLONE_TEST_POLICY_IDX(exec, idx),
+                                            std::declval<TmpContainerType<T>>(),
+                                            std::declval<TmpContainerType<T>>(),
+                                            args...));
+            // Fundamental result types are not checked; otherwise fail the build via an always-false dependent condition
+            if constexpr (!std::is_fundamental_v<res_ret_t>)
+            {
+                if constexpr (!all_dangling_in_result_v<res_ret_t>)
+                    static_assert(!std::is_same_v<res_ret_t, res_ret_t>, "res_ret_t is expected to be or consist of std::ranges::dangling");
+            }
+        }
+    }
+
+    // Compile-time check: calling algo with three temporary (rvalue) ranges must give a result that is, or consists of, std::ranges::dangling
+    template <int idx, typename Policy, typename Algo, typename ...Args>
+    constexpr void
+    test_dangling_pointers_args_3(Policy&& exec, Algo&& algo, Args&& ...args)
+    {
+        // The check is skipped when supress_dangling_iterators_check is set for the algorithm
+        if constexpr (!supress_dangling_iterators_check<std::remove_cvref_t<decltype(algo)>>)
+        {
+            using T = typename Container::value_type;
+
+            // Only the return type of the call is inspected (unevaluated decltype), algo is not run here
+            using res_ret_t = decltype(algo(CLONE_TEST_POLICY_IDX(exec, idx),
+                                            std::declval<TmpContainerType<T>>(),
+                                            std::declval<TmpContainerType<T>>(),
+                                            std::declval<TmpContainerType<T>>(),
+                                            args...));
+            // Fundamental result types are not checked; otherwise fail the build via an always-false dependent condition
+            if constexpr (!std::is_fundamental_v<res_ret_t>)
+            {
+                if constexpr (!all_dangling_in_result_v<res_ret_t>)
+                    static_assert(!std::is_same_v<res_ret_t, res_ret_t>, "res_ret_t is expected to be or consist of std::ranges::dangling");
+            }
+        }
+    }
+
+    // Dispatches the dangling-result check by ArgsSize, the number (1, 2 or 3) of temporary ranges passed to algo
+    template <std::size_t ArgsSize, int idx, typename Policy, typename Algo, typename ...Args>
+    constexpr void
+    test_dangling_pointers(Policy&& exec, Algo&& algo, Args&& ...args)
+    {
+        static_assert(ArgsSize == 1 || ArgsSize == 2 || ArgsSize == 3,
+                      "The test for dangling pointers is not implemented for this number of algorithm arguments");
+
+        if constexpr (ArgsSize == 1)
+            test_dangling_pointers_arg_1<idx>(std::forward<Policy>(exec), std::forward<Algo>(algo), std::forward<decltype(args)>(args)...);
+
+        else if constexpr (ArgsSize == 2)
+            test_dangling_pointers_args_2<idx>(std::forward<Policy>(exec), std::forward<Algo>(algo), std::forward<decltype(args)>(args)...);
+
+        else if constexpr (ArgsSize == 3)
+            test_dangling_pointers_args_3<idx>(std::forward<Policy>(exec), std::forward<Algo>(algo), std::forward<decltype(args)>(args)...);
+    }
+
     template<typename Policy, typename Algo, typename Checker, typename TransIn>
     void
     process_data_in(int max_n, Policy&& exec, Algo algo, Checker& checker, TransIn tr_in, auto... args)
@@ -353,6 +470,42 @@ struct test
         }
     }
 
+    // Returns the part of an output view without padding: for the *_lim modes kPaddingSize elements are cut
+    // off at the front and at the back of view, for any other mode view is forwarded unchanged.
+    template <TestDataMode mode, typename View>
+    decltype(auto)
+    get_view_part_for_output_wo_padding(View&& view)
+    {
+        if constexpr (mode != data_in_out_lim && mode != data_in_in_out_lim)
+            return std::forward<View>(view);
+        else
+        {
+            const auto payload_size = std::ranges::size(view) - kPaddingSize * kParts;
+            return view | std::views::drop(kPaddingSize) | std::views::take(payload_size);
+        }
+    }
+
+    // Checks the padding of a padded output after an algorithm call: for the *_lim modes the first and the
+    // last kPaddingSize elements of view must still be equal to data_gen_unprocessed(idx), idx being the
+    // offset inside the padding area. For any other mode nothing is checked and true is returned.
+    template <TestDataMode mode, typename View>
+    bool check_padding(View&& view)
+    {
+        if constexpr (mode == data_in_out_lim || mode == data_in_in_out_lim)
+        {
+            const auto head = view.begin();
+            const auto tail = view.begin() + view.size() - kPaddingSize;
+
+            for (int idx = 0; idx < kPaddingSize; ++idx)
+            {
+                if (*(head + idx) != data_gen_unprocessed(idx) || *(tail + idx) != data_gen_unprocessed(idx))
+                    return false;
+            }
+        }
+
+        return true;
+    }
+
     template<typename Policy, typename Algo, typename Checker, typename TransIn, typename TransOut,
              TestDataMode mode = test_mode>
     void
@@ -368,7 +521,7 @@ struct test
         Container cont_in(exec, n_in, DataGen1{});
         Container cont_in_exp(exec, n_in, DataGen1{});
 
-        Container cont_out(exec, n_out, data_gen_unprocessed);
+        Container cont_out(exec, get_padded_size(n_out), data_gen_unprocessed);
         Container cont_out_exp(exec, n_out, data_gen_unprocessed);
 
         assert(n_in <= max_n);
@@ -379,14 +532,20 @@ struct test
         auto expected_res = checker(in_exp_view, out_exp_view, args...);
 
         typename Container::type& A = cont_in();
-        typename Container::type& B = cont_out();
+        auto&& B_with_padding = cont_out();
+        auto&& B = get_view_part_for_output_wo_padding<mode>(B_with_padding);
 
         auto res = algo(CLONE_TEST_POLICY(exec), tr_in(A), tr_out(B), args...);
 
         // check result types
         static_assert(std::is_same_v<decltype(res), decltype(expected_res)>, "Wrong return type");
 
-        if constexpr (check_in_in_out_result<decltype(expected_res)>)
+        if constexpr (check_in_out_result<decltype(expected_res)>)
+        {
+            EXPECT_EQ(ret_in_val(expected_res, in_exp_view.begin()), ret_in_val(res, tr_in(A).begin()),
+                      (std::string("wrong input stop position with ") + typeid(Algo).name() + sizes).c_str());
+        }
+        else if constexpr (check_in_in_out_result<decltype(expected_res)>)
         {
             EXPECT_EQ(ret_in_val<1>(expected_res, in_exp_view.begin()), ret_in_val<1>(res, tr_in(A).begin()),
                       (std::string("wrong input stop position with ") + names + sizes).c_str());
@@ -411,9 +570,13 @@ struct test
         EXPECT_EQ(ret_out_val(expected_res, out_exp_view.begin()), ret_out_val(res, tr_out(B).begin()),
                   (std::string("wrong output stop position with ") + names + sizes).c_str());
 
+        // Check padding data
+        EXPECT_TRUE(check_padding<mode>(B_with_padding),
+                    (std::string("wrong padding data after algo with ranges: ") + names).c_str());
+
         //check result
         auto n = std::ranges::size(out_exp_view);
-        EXPECT_EQ_N(cont_out_exp().begin(), cont_out().begin(), n, 
+        EXPECT_EQ_N(cont_out_exp().begin(), B.begin(), n, 
                     (std::string("output mismatch with ") + names + sizes).c_str());
 
         //check result
@@ -451,7 +614,7 @@ struct test
         process_data_in_out(max_n, r_size, r_size, CLONE_TEST_POLICY(exec), algo, checker, args...);
 
         //test cases with empty sequence(s)
-	    process_data_in_out(max_n, 0, 0, CLONE_TEST_POLICY(exec), algo, checker, args...);
+        process_data_in_out(max_n, 0, 0, CLONE_TEST_POLICY(exec), algo, checker, args...);
     }
 
     template<typename Policy, typename Algo, typename Checker, TestDataMode mode = test_mode>
@@ -462,8 +625,8 @@ struct test
         process_data_in_out(max_n, r_size, r_size, CLONE_TEST_POLICY(exec), algo, checker, args...);
 
         //test case size of input range is less than size of output and vice-versa
-        process_data_in_out(max_n, r_size/2, r_size, CLONE_TEST_POLICY(exec), algo, checker, args...);
-        process_data_in_out(max_n, r_size, r_size/2, CLONE_TEST_POLICY(exec), algo, checker, args...);
+        process_data_in_out(max_n, r_size / kParts, r_size,          CLONE_TEST_POLICY(exec), algo, checker, args...);
+        process_data_in_out(max_n, r_size,          r_size / kParts, CLONE_TEST_POLICY(exec), algo, checker, args...);
 
         //test cases with empty sequence(s)
         process_data_in_out(max_n, 0, 0, CLONE_TEST_POLICY(exec), algo, checker, args...);
@@ -484,8 +647,8 @@ struct test
         process_data_in_in(max_n, r_size, r_size, CLONE_TEST_POLICY(exec), algo, checker, tr_in, args...);
 
         //test case the sizes of input ranges are different
-        process_data_in_in(max_n, r_size/2, r_size, CLONE_TEST_POLICY(exec), algo, checker, tr_in, args...);
-        process_data_in_in(max_n, r_size, r_size/2, CLONE_TEST_POLICY(exec), algo, checker, tr_in, args...);
+        process_data_in_in(max_n, r_size / kParts, r_size,          CLONE_TEST_POLICY(exec), algo, checker, tr_in, args...);
+        process_data_in_in(max_n, r_size,          r_size / kParts, CLONE_TEST_POLICY(exec), algo, checker, tr_in, args...);
 
         //test cases with empty sequence(s)
         process_data_in_in(max_n, 0, 0, CLONE_TEST_POLICY(exec), algo, checker, tr_in, args...);
@@ -580,7 +743,7 @@ struct test
         Container cont_in1(exec, n_in1, DataGen1{});
         Container cont_in2(exec, n_in2, DataGen2{});
 
-        Container cont_out(exec, n_out, data_gen_unprocessed);
+        Container cont_out(exec, get_padded_size(n_out), data_gen_unprocessed);
         Container cont_exp(exec, n_out, data_gen_unprocessed);
 
         assert(n_in1 <= max_n);
@@ -593,14 +756,24 @@ struct test
 
         typename Container::type& A = cont_in1();
         typename Container::type& B = cont_in2();
-        typename Container::type& C = cont_out();
+        auto&& C_with_padding = cont_out();
+        auto&& C = get_view_part_for_output_wo_padding<mode>(C_with_padding);
 
         auto res = algo(CLONE_TEST_POLICY(exec), tr_in(A), tr_in(B), tr_out(C), args...);
 
+        // Check padding data
+        EXPECT_TRUE(check_padding<mode>(C_with_padding),
+                    (std::string("wrong padding data after algo with ranges: ") + typeid(Algo).name()).c_str());
+
         // check result types
         static_assert(std::is_same_v<decltype(res), decltype(expected_res)>, "Wrong return type");
 
-        if constexpr (check_in_in_out_result<decltype(expected_res)>)
+        if constexpr (check_in_out_result<decltype(expected_res)>)
+        {
+            EXPECT_EQ(ret_in_val(expected_res, src_view1.begin()), ret_in_val(res, tr_in(A).begin()),
+                      (std::string("wrong first input stop position with ") + typeid(Algo).name() + sizes).c_str());
+        }
+        else if constexpr (check_in_in_out_result<decltype(expected_res)>)
         {
             EXPECT_EQ(ret_in_val<1>(expected_res, src_view1.begin()), ret_in_val<1>(res, tr_in(A).begin()),
                       (std::string("wrong first input stop position with ") + typeid(Algo).name() + sizes).c_str());
@@ -621,7 +794,7 @@ struct test
 
         //check result
         auto n = std::ranges::size(expected_view);
-        EXPECT_EQ_N(cont_exp().begin(), cont_out().begin(), n, (std::string("output mismatch with ")
+        EXPECT_EQ_N(cont_exp().begin(), C.begin(), n, (std::string("output mismatch with ")
                     + typeid(Algo).name() + typeid(Policy).name() + sizes).c_str());
 
         if constexpr(!supress_dangling_iterators_check<std::remove_cvref_t<decltype(algo)>>)
@@ -652,12 +825,12 @@ struct test
     operator()(int max_n, Policy&& exec, Algo algo, Checker& checker, auto... args)
     {
         const int r_size = max_n;
-        process_data_in_in_out(max_n, r_size, r_size, r_size*2, CLONE_TEST_POLICY(exec), algo, checker, args...);
+        process_data_in_in_out(max_n, r_size, r_size, r_size * kParts, CLONE_TEST_POLICY(exec), algo, checker, args...);
 
         //test cases with empty sequence(s)
-        process_data_in_in_out(max_n, 0, 0, 0, CLONE_TEST_POLICY(exec), algo, checker, args...);
-        process_data_in_in_out(max_n, 0, r_size, out_size_with_empty_in1<Algo>(r_size), CLONE_TEST_POLICY(exec), algo, checker, args...);
-        process_data_in_in_out(max_n, r_size, 0, out_size_with_empty_in2<Algo>(r_size), CLONE_TEST_POLICY(exec), algo, checker, args...);
+        process_data_in_in_out(max_n, 0,      0,                                          0, CLONE_TEST_POLICY(exec), algo, checker, args...);
+        process_data_in_in_out(max_n, 0,      r_size, out_size_with_empty_in1<Algo>(r_size), CLONE_TEST_POLICY(exec), algo, checker, args...);
+        process_data_in_in_out(max_n, r_size, 0,      out_size_with_empty_in2<Algo>(r_size), CLONE_TEST_POLICY(exec), algo, checker, args...);
     }
 
     template<typename Policy, typename Algo, typename Checker, TestDataMode mode = test_mode>
@@ -665,11 +838,11 @@ struct test
     operator()(int max_n, Policy&& exec, Algo algo, Checker& checker, auto... args)
     {
         const int r_size = max_n;
-        process_data_in_in_out(max_n, r_size, r_size, r_size, CLONE_TEST_POLICY(exec), algo, checker, args...);
-        process_data_in_in_out(max_n, r_size, r_size, r_size*2, CLONE_TEST_POLICY(exec), algo, checker, args...);
-        process_data_in_in_out(max_n, r_size/2, r_size, r_size, CLONE_TEST_POLICY(exec), algo, checker, args...);
-        process_data_in_in_out(max_n, r_size, r_size/2, r_size, CLONE_TEST_POLICY(exec), algo, checker, args...);
-        process_data_in_in_out(max_n, r_size, r_size, r_size/2, CLONE_TEST_POLICY(exec), algo, checker, args...);
+        process_data_in_in_out(max_n, r_size,          r_size,          r_size,          CLONE_TEST_POLICY(exec), algo, checker, args...);
+        process_data_in_in_out(max_n, r_size,          r_size,          r_size * kParts, CLONE_TEST_POLICY(exec), algo, checker, args...);
+        process_data_in_in_out(max_n, r_size / kParts, r_size,          r_size,          CLONE_TEST_POLICY(exec), algo, checker, args...);
+        process_data_in_in_out(max_n, r_size,          r_size / kParts, r_size,          CLONE_TEST_POLICY(exec), algo, checker, args...);
+        process_data_in_in_out(max_n, r_size,          r_size,          r_size / kParts, CLONE_TEST_POLICY(exec), algo, checker, args...);
 
         //test cases with empty sequence(s) and/or zero output capacity
         process_data_in_in_out(max_n, 0, 0, 0, CLONE_TEST_POLICY(exec), algo, checker, args...);
@@ -921,6 +1094,14 @@ struct span_view_fo
 };
 #endif
 
+// TODO: remove once range-based set operations for bounded output ranges are implemented for hetero policies
+template <TestDataMode mode>
+struct ResolveTestDataModeForHeteroPolicy
+{
+    static constexpr bool RunTestForHeteroPolicy = true; // the hetero-policy tests are run only when this is true
+    static constexpr TestDataMode res_mode = mode;       // mode used for the hetero-policy tests (unchanged by default)
+};
+
 template<int call_id = 0, typename T = int, TestDataMode mode = data_in, typename DataGen1 = std::identity,
          typename DataGen2 = decltype(data_gen2_default)>
 struct test_range_algo
@@ -996,12 +1177,18 @@ struct test_range_algo
             if constexpr(!std::disjunction_v<std::is_member_pointer<decltype(args)>...>)
 #endif
             {
-                test<T, usm_vector<T>,   mode, DataGen1, DataGen2>{}(n_device, CLONE_TEST_POLICY_IDX(exec, call_id + 10), algo, checker, subrange_view,   subrange_view,   args...);
-                test<T, usm_subrange<T>, mode, DataGen1, DataGen2>{}(n_device, CLONE_TEST_POLICY_IDX(exec, call_id + 30), algo, checker, std::identity{}, std::identity{}, args...);
+                if constexpr (ResolveTestDataModeForHeteroPolicy<mode>::RunTestForHeteroPolicy)
+                {
+                    // TODO: remove once range-based set operations for bounded output ranges are implemented for hetero policies
+                    constexpr TestDataMode resHeteroMode = ResolveTestDataModeForHeteroPolicy<mode>::res_mode;
+
+                    test<T, usm_vector<T>,   resHeteroMode, DataGen1, DataGen2>{}(n_device, CLONE_TEST_POLICY_IDX(exec, call_id + 10), algo, checker, subrange_view,   subrange_view,   args...);
+                    test<T, usm_subrange<T>, resHeteroMode, DataGen1, DataGen2>{}(n_device, CLONE_TEST_POLICY_IDX(exec, call_id + 30), algo, checker, std::identity{}, std::identity{}, args...);
 #if TEST_CPP20_SPAN_PRESENT
-                test<T, usm_vector<T>,   mode, DataGen1, DataGen2>{}(n_device, CLONE_TEST_POLICY_IDX(exec, call_id + 20), algo, checker, span_view,       subrange_view,   args...);
-                test<T, usm_span<T>,     mode, DataGen1, DataGen2>{}(n_device, CLONE_TEST_POLICY_IDX(exec, call_id + 40), algo, checker, std::identity{}, std::identity{}, args...);
+                    test<T, usm_vector<T>,   resHeteroMode, DataGen1, DataGen2>{}(n_device, CLONE_TEST_POLICY_IDX(exec, call_id + 20), algo, checker, span_view,       subrange_view,   args...);
+                    test<T, usm_span<T>,     resHeteroMode, DataGen1, DataGen2>{}(n_device, CLONE_TEST_POLICY_IDX(exec, call_id + 40), algo, checker, std::identity{}, std::identity{}, args...);
 #endif
+                }
             }
         }
     }
diff --git a/test/support/test_config.h b/test/support/test_config.h
index 7998bfe38e0..99f9ad33577 100644
--- a/test/support/test_config.h
+++ b/test/support/test_config.h
@@ -312,17 +312,6 @@
 #   define _PSTL_ICPX_DEVICE_COPYABLE_SUBMITTER_BROKEN 0
 #endif
 
-// There is a bug in the libc++ at the time of writing this comment with 21 being the latest major release
-// 23 is set to avoid frequent bump-ups.
-// See: https://github.com/llvm/llvm-project/blob/6096d35ea93c75f648a253a00775b4d74915c819/libcxx/include/__algorithm/ranges_set_union.h#L94
-// This line does not take into account that the iterator-based implementation may arbitrary call comp(a, b) or comp(b, a)
-// TODO: report it or contribute.
-#if defined(_LIBCPP_VERSION) && _LIBCPP_VERSION <= 230000
-#    define _PSTL_LIBCPP_RANGE_SET_BROKEN 1
-#else
-#    define _PSTL_LIBCPP_RANGE_SET_BROKEN 0
-#endif
-
 // Drop view throws exceptions in libstdc++ 10
 #define _PSTL_LIBSTDCXX_XPU_DROP_VIEW_BROKEN (_GLIBCXX_RELEASE == 10)
 
@@ -346,4 +335,7 @@
 // std::input_iterator and std::output_iterator on the same pre-P2325R3 implementations.
 #define _ONEDPL_CPP20_IN_OUT_ITERATOR_BROKEN TEST_STD_RANGES_VIEW_CONCEPT_REQUIRES_DEFAULT_INITIALIZABLE
 
+// TODO remove after implementation of range-based set operations with hetero policies
+#define STD_RANGES_SET_INTERSECTION_BROKEN_FOR_HETERO_POLICY 1
+
 #endif // _TEST_CONFIG_H
diff --git a/test/support/utils.h b/test/support/utils.h
index 593d83a08b0..ff2425baa97 100644
--- a/test/support/utils.h
+++ b/test/support/utils.h
@@ -160,31 +160,88 @@ std::string log_value_title(TagActual)
     return " got ";
 }
 
+// Fallback selected when IsOutputStreamable<TValue, TStream> is false: logs a placeholder instead of the value.
+// A "," is written first when commaNeeded is set; commaNeeded is set to true afterwards.
+template <typename TStream, typename TValue>
+std::enable_if_t<!IsOutputStreamable<TValue, TStream>::value>
+log_value_to_stream(TStream& os, const TValue&, bool& commaNeeded)
+{
+    if (commaNeeded)
+        os << ",";
+    os << "(unable to log value)";
+    commaNeeded = true;
+}
+
+template <typename TValue>
+constexpr bool is_any_char_type_v = // true only for the character types listed below (wchar_t is not among them)
+    std::is_same_v<TValue, char> || std::is_same_v<TValue, signed char> || std::is_same_v<TValue, unsigned char>
+#if defined(__cpp_char8_t)
+    || std::is_same_v<TValue, char8_t>
+#endif
+    || std::is_same_v<TValue, char16_t> || std::is_same_v<TValue, char32_t>;
+
+// Logs a streamable value that is neither an enumeration nor a character type via operator<<.
+// std::boolalpha is set for the output (bool is printed as true/false) and cleared again afterwards.
+template <typename TStream, typename TValue>
+std::enable_if_t<IsOutputStreamable<TValue, TStream>::value && !std::is_enum_v<TValue> && !is_any_char_type_v<TValue>>
+log_value_to_stream(TStream& os, const TValue& value, bool& commaNeeded)
+{
+    if (commaNeeded)
+        os << ",";
+    os << std::boolalpha << value << std::noboolalpha;
+    commaNeeded = true;
+}
+
+// Logs a value of a character type as its numeric code instead of a glyph; "," is written first if commaNeeded.
+template <typename TStream, typename TValue>
+std::enable_if_t<IsOutputStreamable<TValue, TStream>::value && is_any_char_type_v<TValue>>
+log_value_to_stream(TStream& os, const TValue& value, bool& commaNeeded)
+{
+    if (commaNeeded)
+        os << ",";
+    // Widen to a type of the same signedness: a negative char / signed char must not be shown as 4294967295
+    os << static_cast<std::conditional_t<std::is_signed_v<TValue>, std::int32_t, std::uint32_t>>(value);
+    commaNeeded = true;
+}
+
+template <typename TStream, typename TValue>
+std::enable_if_t<IsOutputStreamable<TValue, TStream>::value && std::is_enum_v<TValue> && !is_any_char_type_v<TValue>>
+log_value_to_stream(TStream& os, const TValue& value, bool& commaNeeded)
+{
+    log_value_to_stream(os, static_cast<std::underlying_type_t<TValue>>(value), commaNeeded); // log the underlying integer value
+}
+
+// Logs a oneDPL internal tuple as "(e1,e2,...)": every element is logged with log_value_to_stream in turn.
+// A "," is written first when commaNeeded is set; commaNeeded is set to true afterwards.
+template <typename TStream, typename... T>
+void
+log_value_to_stream(TStream& os, const oneapi::dpl::__internal::tuple<T...>& value, bool& commaNeeded)
+{
+    using std_tuple_t = typename oneapi::dpl::__internal::tuple<T...>::tuple_type;
+    std_tuple_t std_tuple = value;
+    if (commaNeeded)
+        os << ",";
+
+    bool bInternalCommaNeeded = false;
+    os << "(";
+    std::apply([&os, &bInternalCommaNeeded](
+                   const auto&... elems) { (log_value_to_stream(os, elems, bInternalCommaNeeded), ...); },
+               std_tuple);
+    os << ")";
+    commaNeeded = true;
+}
+
 template <typename TStream, typename Tag, typename TValue>
- void log_value(TStream& os, Tag, const TValue& value, bool bCommaNeeded)
+void
+log_value(TStream& os, Tag, const TValue& value, bool commaNeeded)
 {
-    if (bCommaNeeded)
+    if (commaNeeded)
         os << ",";
+
     os << log_value_title(Tag{});
 
-    if constexpr (IsOutputStreamable<TValue, decltype(os)>::value)
-    {
-        if constexpr (std::is_same_v<bool, std::decay_t<TValue>>)
-        {
-            if (value)
-                os << "true";
-            else
-                os << "false";
-        }
-        else
-        {
-            os << value;
-        }
-    }
-    else
-    {
-        os << "(unable to log value)";
-    }
+    bool bInternalCommaNeeded = false;
+    log_value_to_stream(os, value, bInternalCommaNeeded);
 }
 
 // Do not change signature to const T&.
@@ -1421,6 +1478,47 @@ struct DefaultInitializedToOne
     }
 };
 
+// The idea of this struct is to have a data item that can be used in set tests.
+// Each item has a value, an index in container and a container number.
+// This will allow us to test if the set-algorithms work correctly with sets of data.
+template <typename T>
+struct SetDataItem
+{
+    T value{};                      // Value of the item
+    std::size_t index = 0;          // Index of the item in the container
+    std::size_t series = 0;         // Container number
+
+    friend bool
+    operator==(const SetDataItem& item1, const SetDataItem& item2)
+    {
+        return item1.value == item2.value && item1.index == item2.index && item1.series == item2.series;
+    }
+
+    template <typename OStream>
+    friend OStream&
+    operator<<(OStream& os, const SetDataItem& item)
+    {
+        os << "{ value = " << item.value << ", index = " << item.index << ", series = " << item.series << "}";
+        return os;
+    }
+};
+
+// Projection to extract 'value' field from SetDataItem
+struct SetDataItemProj
+{
+    template <typename T>
+    decltype (auto)
+    operator()(const SetDataItem<T>& item) const
+    {
+        // Parentheses are required for correct decltype(auto) deduction:
+        // - without them: decltype(item.value) => T (copy, declared member type)
+        // - with them:    decltype((item.value)) => const T& (lvalue expression)
+        // This ensures the projection returns a reference, not a copy,
+        // to test that algorithms work correctly with reference-returning projections.
+        return (item.value);
+    }
+};
+
 } /* namespace TestUtils */
 
 #if _ENABLE_STD_RANGES_TESTING