From 60496a818b87f898de9b220387d026ff25e5c54d Mon Sep 17 00:00:00 2001 From: gaoflow Date: Fri, 29 May 2026 04:07:24 +0200 Subject: [PATCH 1/2] compiler: Avoid int32 overflow in linearized host-device transfer size When a host-device data transfer is linearized, its array section size is emitted as a product of the Function's per-dimension sizes, e.g. `copyin(u[0:u_vec->size[0]*u_vec->size[1]*u_vec->size[2]*u_vec->size[3]])`. The `size[i]` fields are 32-bit C ints, so for a Function with more than ~2**31 elements the product overflows `int` before it is used as the transfer bound, yielding a bogus size and a corrupt/failed device transfer. Cast each factor of the product to a C `long` (a 64-bit integer on LP64 platforms) so the multiplication is carried out in 64-bit arithmetic. Casting the whole product would be too late (the overflow would already have occurred), so each factor is cast individually. Non-product bounds (a single size, an offset, a constant) cannot overflow and are left untouched, as are non-transfer expressions. 
Fixes #2777 --- devito/passes/iet/parpragma.py | 23 +++++++++++++++++++++-- tests/test_gpu_common.py | 22 ++++++++++++++++++++++ tests/test_gpu_openmp.py | 28 ++++++++++++++++------------ 3 files changed, 59 insertions(+), 14 deletions(-) diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py index d5752fec52..f14f5ccb4d 100644 --- a/devito/passes/iet/parpragma.py +++ b/devito/passes/iet/parpragma.py @@ -17,7 +17,7 @@ from devito.passes.iet.langbase import ( DeviceAwareMixin, LangBB, LangTransformer, ShmTransformer, make_sections_from_imask ) -from devito.symbolics import INT +from devito.symbolics import INT, LONG from devito.tools import as_tuple, flatten, is_integer, prod from devito.types import Symbol @@ -450,6 +450,25 @@ def make_parallel(self, graph, **kwargs): return self._make_parallel(graph, sync_mapper=graph.sync_mapper) +def _avoid_overflow(expr): + """ + The bounds of a host-device transfer section may be a product of the + Function's per-dimension sizes (e.g. ``size[0]*size[1]*size[2]``), as + happens when the transferred data is flattened (linearized). These sizes + are 32-bit C ints, so for a sufficiently large Function (more than ~2**31 + elements) the product overflows `int` before it is used as an array bound, + producing a bogus transfer size (see issue #2777). Cast each factor of the + product to a 64-bit integer so the multiplication is carried out in 64-bit + arithmetic. A cast on the whole product would be too late (the overflow + would already have happened), hence each factor is cast individually. + Non-product bounds (a single size, an offset, a constant) cannot overflow + and are left untouched. 
+ """ + if getattr(expr, 'is_Mul', False): + return expr.func(*[LONG(a) for a in expr.args]) + return expr + + class PragmaTransfer(Pragma, Transfer): """ @@ -492,7 +511,7 @@ def expr_symbols(self): @cached_property def _generate(self): # Stringify sections - sections = ''.join([f'[{ccode(i)}:{ccode(j)}]' + sections = ''.join([f'[{ccode(_avoid_overflow(i))}:{ccode(_avoid_overflow(j))}]' for i, j in self.sections]) arguments = [ccode(i) for i in self.arguments] return self.pragma % (self.function.name, sections, *arguments) diff --git a/tests/test_gpu_common.py b/tests/test_gpu_common.py index e84d0df5d8..29e628cc87 100644 --- a/tests/test_gpu_common.py +++ b/tests/test_gpu_common.py @@ -245,6 +245,28 @@ def test_linearize(self): op.apply(time_M=10) assert np.all(u.data[1] == 11) + def test_linearize_transfer_no_overflow(self): + # When a transfer is linearized, its size is a product of the + # Function's per-dimension sizes (e.g. `size[0]*size[1]*size[2]`). + # These are 32-bit C ints, so for a Function with more than ~2**31 + # elements the product overflows `int` before being used as the + # transfer bound, producing a bogus size (issue #2777). Each factor + # must be cast to a 64-bit int so the product is computed in 64-bit. 
+ grid = Grid(shape=(4, 5, 6)) + + u = TimeFunction(name='u', grid=grid) + + op = Operator(Eq(u.forward, u + 1), opt=('advanced', {'linearize': True})) + + # The transfer bound is a product of the four `size[i]`, each cast to + # `long`; the multiplication is thus carried out in 64-bit arithmetic + for transfer in op.body.maps + op.body.unmaps: + code = transfer.ccode.value + for i in range(4): + assert f'(long)(u_vec->size[{i}])' in code + # No un-cast `size[i]*` product (which would overflow in 32-bit) + assert 'u_vec->size[0]*' not in code + class TestPassesEdgeCases: diff --git a/tests/test_gpu_openmp.py b/tests/test_gpu_openmp.py index d6505a6f71..aba82257ae 100644 --- a/tests/test_gpu_openmp.py +++ b/tests/test_gpu_openmp.py @@ -54,14 +54,15 @@ def test_basic(self): assert trees[0][1].pragmas[0].ccode.value ==\ 'omp target teams distribute parallel for collapse(3)' assert op.body.maps[0].ccode.value ==\ - ('omp target enter data map(to: u[0:u_vec->size[0]*' - 'u_vec->size[1]*u_vec->size[2]*u_vec->size[3]])') + ('omp target enter data map(to: u[0:(long)(u_vec->size[0])*' + '(long)(u_vec->size[1])*(long)(u_vec->size[2])*(long)(u_vec->size[3])])') assert op.body.unmaps[0].ccode.value ==\ - ('omp target update from(u[0:u_vec->size[0]*' - 'u_vec->size[1]*u_vec->size[2]*u_vec->size[3]])') + ('omp target update from(u[0:(long)(u_vec->size[0])*' + '(long)(u_vec->size[1])*(long)(u_vec->size[2])*(long)(u_vec->size[3])])') assert op.body.unmaps[1].ccode.value ==\ - ('omp target exit data map(release: u[0:u_vec->size[0]*' - 'u_vec->size[1]*u_vec->size[2]*u_vec->size[3]]) if(devicerm)') + ('omp target exit data map(release: u[0:(long)(u_vec->size[0])*' + '(long)(u_vec->size[1])*(long)(u_vec->size[2])*(long)(u_vec->size[3])]) ' + 'if(devicerm)') # Currently, advanced-fsg mode == advanced mode op1 = Operator(Eq(u.forward, u + 1), language='openmp', opt='advanced-fsg') @@ -125,14 +126,17 @@ def test_multiple_eqns(self): 'omp target teams distribute parallel for collapse(3)' 
for i, f in enumerate([u, v]): assert op.body.maps[i].ccode.value ==\ - (f'omp target enter data map(to: {f.name}[0:{f.name}_vec->size[0]*' - f'{f.name}_vec->size[1]*{f.name}_vec->size[2]*{f.name}_vec->size[3]])') + (f'omp target enter data map(to: {f.name}' + f'[0:(long)({f.name}_vec->size[0])*(long)({f.name}_vec->size[1])*' + f'(long)({f.name}_vec->size[2])*(long)({f.name}_vec->size[3])])') assert op.body.unmaps[2*i + 0].ccode.value ==\ - (f'omp target update from({f.name}[0:{f.name}_vec->size[0]*' - f'{f.name}_vec->size[1]*{f.name}_vec->size[2]*{f.name}_vec->size[3]])') + (f'omp target update from({f.name}' + f'[0:(long)({f.name}_vec->size[0])*(long)({f.name}_vec->size[1])*' + f'(long)({f.name}_vec->size[2])*(long)({f.name}_vec->size[3])])') assert op.body.unmaps[2*i + 1].ccode.value ==\ - (f'omp target exit data map(release: {f.name}[0:{f.name}_vec->size[0]*' - f'{f.name}_vec->size[1]*{f.name}_vec->size[2]*{f.name}_vec->size[3]]) ' + (f'omp target exit data map(release: {f.name}' + f'[0:(long)({f.name}_vec->size[0])*(long)({f.name}_vec->size[1])*' + f'(long)({f.name}_vec->size[2])*(long)({f.name}_vec->size[3])]) ' 'if(devicerm)') def test_multiple_loops(self): From 74ee8dbb7e45b05abb25be08f058000a5e3c5c0f Mon Sep 17 00:00:00 2001 From: gaoflow Date: Fri, 29 May 2026 10:51:07 +0200 Subject: [PATCH 2/2] compiler: Use as_long for transfer size cast, fix it for Indexed leaves Address review: replace the ad-hoc _avoid_overflow helper with the existing as_long. as_long only substituted plain Symbols (retrieve_symbols), so it was a no-op on the IndexedPointer size factors (vec->size[i]) of a linearized transfer bound; extend it to retrieve_terminals so Indexed/IndexedPointer leaves are cast too. Keep the cast scoped to Mul products in PragmaTransfer so non-linearized multi-dimensional sections are not needlessly upcast. 
--- devito/passes/iet/parpragma.py | 34 +++++++++++--------------------- devito/symbolics/manipulation.py | 10 +++++++--- tests/test_gpu_openmp.py | 24 +++++++++++----------- 3 files changed, 31 insertions(+), 37 deletions(-) diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py index f14f5ccb4d..227d380a5b 100644 --- a/devito/passes/iet/parpragma.py +++ b/devito/passes/iet/parpragma.py @@ -17,7 +17,7 @@ from devito.passes.iet.langbase import ( DeviceAwareMixin, LangBB, LangTransformer, ShmTransformer, make_sections_from_imask ) -from devito.symbolics import INT, LONG +from devito.symbolics import INT, as_long from devito.tools import as_tuple, flatten, is_integer, prod from devito.types import Symbol @@ -450,25 +450,6 @@ def make_parallel(self, graph, **kwargs): return self._make_parallel(graph, sync_mapper=graph.sync_mapper) -def _avoid_overflow(expr): - """ - The bounds of a host-device transfer section may be a product of the - Function's per-dimension sizes (e.g. ``size[0]*size[1]*size[2]``), as - happens when the transferred data is flattened (linearized). These sizes - are 32-bit C ints, so for a sufficiently large Function (more than ~2**31 - elements) the product overflows `int` before it is used as an array bound, - producing a bogus transfer size (see issue #2777). Cast each factor of the - product to a 64-bit integer so the multiplication is carried out in 64-bit - arithmetic. A cast on the whole product would be too late (the overflow - would already have happened), hence each factor is cast individually. - Non-product bounds (a single size, an offset, a constant) cannot overflow - and are left untouched. 
- """ - if getattr(expr, 'is_Mul', False): - return expr.func(*[LONG(a) for a in expr.args]) - return expr - - class PragmaTransfer(Pragma, Transfer): """ @@ -510,8 +491,17 @@ def expr_symbols(self): @cached_property def _generate(self): - # Stringify sections - sections = ''.join([f'[{ccode(_avoid_overflow(i))}:{ccode(_avoid_overflow(j))}]' + # Stringify sections. When a transfer is linearized, the section size + # is a product of the Function's 32-bit per-dimension sizes (e.g. + # `size[0]*size[1]*size[2]`); for a Function with more than ~2**31 + # elements this product overflows `int` before being used as an array + # bound, producing a bogus transfer size (#2777). Promote such products + # to 64-bit with `as_long` so the multiplication is carried out in + # 64-bit arithmetic. Non-product bounds (a single size, an offset, a + # constant) cannot overflow and are left untouched. + def cast(e): + return as_long(e) if getattr(e, 'is_Mul', False) else e + sections = ''.join([f'[{ccode(cast(i))}:{ccode(cast(j))}]' for i, j in self.sections]) arguments = [ccode(i) for i in self.arguments] return self.pragma % (self.function.name, sections, *arguments) diff --git a/devito/symbolics/manipulation.py b/devito/symbolics/manipulation.py index 57d9314e16..a212053a07 100644 --- a/devito/symbolics/manipulation.py +++ b/devito/symbolics/manipulation.py @@ -11,7 +11,8 @@ from devito.symbolics.extended_dtypes import LONG from devito.symbolics.extended_sympy import DefFunction, rfunc from devito.symbolics.queries import q_leaf -from devito.symbolics.search import retrieve_functions, retrieve_indexed, retrieve_symbols +from devito.symbolics.search import (retrieve_functions, retrieve_indexed, + retrieve_terminals) from devito.symbolics.unevaluation import Add as UnevalAdd from devito.symbolics.unevaluation import Mul as UnevalMul from devito.symbolics.unevaluation import Pow as UnevalPow @@ -547,7 +548,10 @@ def as_long(expr): Convert an expression and its symbolic args to a long 
integer. """ try: - syms = retrieve_symbols(expr) - return expr.subs({s: LONG(s) for s in syms}) + # Cast every symbolic leaf, including Indexeds and IndexedPointers + # (e.g. ``vec->size[i]``), not just plain Symbols, so that products of + # such leaves are evaluated in 64-bit arithmetic + terminals = retrieve_terminals(expr) + return expr.subs({s: LONG(s) for s in terminals}) except AttributeError: return LONG(expr) diff --git a/tests/test_gpu_openmp.py b/tests/test_gpu_openmp.py index aba82257ae..98bc8e170a 100644 --- a/tests/test_gpu_openmp.py +++ b/tests/test_gpu_openmp.py @@ -54,14 +54,14 @@ def test_basic(self): assert trees[0][1].pragmas[0].ccode.value ==\ 'omp target teams distribute parallel for collapse(3)' assert op.body.maps[0].ccode.value ==\ - ('omp target enter data map(to: u[0:(long)(u_vec->size[0])*' - '(long)(u_vec->size[1])*(long)(u_vec->size[2])*(long)(u_vec->size[3])])') + ('omp target enter data map(to: u[0:(long)(u_vec->size[3])*' + '(long)(u_vec->size[2])*(long)(u_vec->size[1])*(long)(u_vec->size[0])])') assert op.body.unmaps[0].ccode.value ==\ - ('omp target update from(u[0:(long)(u_vec->size[0])*' - '(long)(u_vec->size[1])*(long)(u_vec->size[2])*(long)(u_vec->size[3])])') + ('omp target update from(u[0:(long)(u_vec->size[3])*' + '(long)(u_vec->size[2])*(long)(u_vec->size[1])*(long)(u_vec->size[0])])') assert op.body.unmaps[1].ccode.value ==\ - ('omp target exit data map(release: u[0:(long)(u_vec->size[0])*' - '(long)(u_vec->size[1])*(long)(u_vec->size[2])*(long)(u_vec->size[3])]) ' + ('omp target exit data map(release: u[0:(long)(u_vec->size[3])*' + '(long)(u_vec->size[2])*(long)(u_vec->size[1])*(long)(u_vec->size[0])]) ' 'if(devicerm)') # Currently, advanced-fsg mode == advanced mode @@ -127,16 +127,16 @@ def test_multiple_eqns(self): for i, f in enumerate([u, v]): assert op.body.maps[i].ccode.value ==\ (f'omp target enter data map(to: {f.name}' - f'[0:(long)({f.name}_vec->size[0])*(long)({f.name}_vec->size[1])*' - 
f'(long)({f.name}_vec->size[2])*(long)({f.name}_vec->size[3])])') + f'[0:(long)({f.name}_vec->size[3])*(long)({f.name}_vec->size[2])*' + f'(long)({f.name}_vec->size[1])*(long)({f.name}_vec->size[0])])') assert op.body.unmaps[2*i + 0].ccode.value ==\ (f'omp target update from({f.name}' - f'[0:(long)({f.name}_vec->size[0])*(long)({f.name}_vec->size[1])*' - f'(long)({f.name}_vec->size[2])*(long)({f.name}_vec->size[3])])') + f'[0:(long)({f.name}_vec->size[3])*(long)({f.name}_vec->size[2])*' + f'(long)({f.name}_vec->size[1])*(long)({f.name}_vec->size[0])])') assert op.body.unmaps[2*i + 1].ccode.value ==\ (f'omp target exit data map(release: {f.name}' - f'[0:(long)({f.name}_vec->size[0])*(long)({f.name}_vec->size[1])*' - f'(long)({f.name}_vec->size[2])*(long)({f.name}_vec->size[3])]) ' + f'[0:(long)({f.name}_vec->size[3])*(long)({f.name}_vec->size[2])*' + f'(long)({f.name}_vec->size[1])*(long)({f.name}_vec->size[0])]) ' 'if(devicerm)') def test_multiple_loops(self):