diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py index d5752fec52..227d380a5b 100644 --- a/devito/passes/iet/parpragma.py +++ b/devito/passes/iet/parpragma.py @@ -17,7 +17,7 @@ from devito.passes.iet.langbase import ( DeviceAwareMixin, LangBB, LangTransformer, ShmTransformer, make_sections_from_imask ) -from devito.symbolics import INT +from devito.symbolics import INT, as_long from devito.tools import as_tuple, flatten, is_integer, prod from devito.types import Symbol @@ -491,8 +491,17 @@ def expr_symbols(self): @cached_property def _generate(self): - # Stringify sections - sections = ''.join([f'[{ccode(i)}:{ccode(j)}]' + # Stringify sections. When a transfer is linearized, the section size + # is a product of the Function's 32-bit per-dimension sizes (e.g. + # `size[0]*size[1]*size[2]`); for a Function with more than ~2**31 + # elements this product overflows `int` before being used as an array + # bound, producing a bogus transfer size (#2777). Promote such products + # to 64-bit with `as_long` so the multiplication is carried out in + # 64-bit arithmetic. Non-product bounds (a single size, an offset, a + # constant) cannot overflow and are left untouched. + def cast(e): + return as_long(e) if getattr(e, 'is_Mul', False) else e + sections = ''.join([f'[{ccode(cast(i))}:{ccode(cast(j))}]' for i, j in self.sections]) arguments = [ccode(i) for i in self.arguments] return self.pragma % (self.function.name, sections, *arguments) diff --git a/devito/symbolics/manipulation.py b/devito/symbolics/manipulation.py index 57d9314e16..a212053a07 100644 --- a/devito/symbolics/manipulation.py +++ b/devito/symbolics/manipulation.py @@ -11,7 +11,8 @@ from devito.symbolics.extended_dtypes import LONG from devito.symbolics.extended_sympy import DefFunction, rfunc from devito.symbolics.queries import q_leaf -from devito.symbolics.search import retrieve_functions, retrieve_indexed, retrieve_symbols +from devito.symbolics.search import (retrieve_functions, retrieve_indexed, + retrieve_terminals) from devito.symbolics.unevaluation import Add as UnevalAdd from devito.symbolics.unevaluation import Mul as UnevalMul from devito.symbolics.unevaluation import Pow as UnevalPow @@ -547,7 +548,10 @@ def as_long(expr): Convert an expression and its symbolic args to a long integer. """ try: - syms = retrieve_symbols(expr) - return expr.subs({s: LONG(s) for s in syms}) + # Cast every symbolic leaf, including Indexeds and IndexedPointers + # (e.g. ``vec->size[i]``), not just plain Symbols, so that products of + # such leaves are evaluated in 64-bit arithmetic + terminals = retrieve_terminals(expr) + return expr.subs({s: LONG(s) for s in terminals}) except AttributeError: return LONG(expr) diff --git a/tests/test_gpu_common.py b/tests/test_gpu_common.py index e84d0df5d8..29e628cc87 100644 --- a/tests/test_gpu_common.py +++ b/tests/test_gpu_common.py @@ -245,6 +245,28 @@ def test_linearize(self): op.apply(time_M=10) assert np.all(u.data[1] == 11) + def test_linearize_transfer_no_overflow(self): + # When a transfer is linearized, its size is a product of the + # Function's per-dimension sizes (e.g. `size[0]*size[1]*size[2]`). + # These are 32-bit C ints, so for a Function with more than ~2**31 + # elements the product overflows `int` before being used as the + # transfer bound, producing a bogus size (issue #2777). Each factor + # must be cast to a 64-bit int so the product is computed in 64-bit. + grid = Grid(shape=(4, 5, 6)) + + u = TimeFunction(name='u', grid=grid) + + op = Operator(Eq(u.forward, u + 1), opt=('advanced', {'linearize': True})) + + # The transfer bound is a product of the four `size[i]`, each cast to + # `long`; the multiplication is thus carried out in 64-bit arithmetic + for transfer in op.body.maps + op.body.unmaps: + code = transfer.ccode.value + for i in range(4): + assert f'(long)(u_vec->size[{i}])' in code + # No un-cast `size[i]*` product (which would overflow in 32-bit) + assert 'u_vec->size[0]*' not in code + class TestPassesEdgeCases: diff --git a/tests/test_gpu_openmp.py b/tests/test_gpu_openmp.py index d6505a6f71..98bc8e170a 100644 --- a/tests/test_gpu_openmp.py +++ b/tests/test_gpu_openmp.py @@ -54,14 +54,15 @@ def test_basic(self): assert trees[0][1].pragmas[0].ccode.value ==\ 'omp target teams distribute parallel for collapse(3)' assert op.body.maps[0].ccode.value ==\ - ('omp target enter data map(to: u[0:u_vec->size[0]*' - 'u_vec->size[1]*u_vec->size[2]*u_vec->size[3]])') + ('omp target enter data map(to: u[0:(long)(u_vec->size[3])*' + '(long)(u_vec->size[2])*(long)(u_vec->size[1])*(long)(u_vec->size[0])])') assert op.body.unmaps[0].ccode.value ==\ - ('omp target update from(u[0:u_vec->size[0]*' - 'u_vec->size[1]*u_vec->size[2]*u_vec->size[3]])') + ('omp target update from(u[0:(long)(u_vec->size[3])*' + '(long)(u_vec->size[2])*(long)(u_vec->size[1])*(long)(u_vec->size[0])])') assert op.body.unmaps[1].ccode.value ==\ - ('omp target exit data map(release: u[0:u_vec->size[0]*' - 'u_vec->size[1]*u_vec->size[2]*u_vec->size[3]]) if(devicerm)') + ('omp target exit data map(release: u[0:(long)(u_vec->size[3])*' + '(long)(u_vec->size[2])*(long)(u_vec->size[1])*(long)(u_vec->size[0])]) ' + 'if(devicerm)') # Currently, advanced-fsg mode == advanced mode op1 = Operator(Eq(u.forward, u + 1), language='openmp', opt='advanced-fsg') @@ -125,14 +126,17 @@ def test_multiple_eqns(self): 'omp target teams distribute parallel for collapse(3)' for i, f in enumerate([u, v]): assert op.body.maps[i].ccode.value ==\ - (f'omp target enter data map(to: {f.name}[0:{f.name}_vec->size[0]*' - f'{f.name}_vec->size[1]*{f.name}_vec->size[2]*{f.name}_vec->size[3]])') + (f'omp target enter data map(to: {f.name}' + f'[0:(long)({f.name}_vec->size[3])*(long)({f.name}_vec->size[2])*' + f'(long)({f.name}_vec->size[1])*(long)({f.name}_vec->size[0])])') assert op.body.unmaps[2*i + 0].ccode.value ==\ - (f'omp target update from({f.name}[0:{f.name}_vec->size[0]*' - f'{f.name}_vec->size[1]*{f.name}_vec->size[2]*{f.name}_vec->size[3]])') + (f'omp target update from({f.name}' + f'[0:(long)({f.name}_vec->size[3])*(long)({f.name}_vec->size[2])*' + f'(long)({f.name}_vec->size[1])*(long)({f.name}_vec->size[0])])') assert op.body.unmaps[2*i + 1].ccode.value ==\ - (f'omp target exit data map(release: {f.name}[0:{f.name}_vec->size[0]*' - f'{f.name}_vec->size[1]*{f.name}_vec->size[2]*{f.name}_vec->size[3]]) ' + (f'omp target exit data map(release: {f.name}' + f'[0:(long)({f.name}_vec->size[3])*(long)({f.name}_vec->size[2])*' + f'(long)({f.name}_vec->size[1])*(long)({f.name}_vec->size[0])]) ' 'if(devicerm)') def test_multiple_loops(self):