Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions devito/passes/iet/parpragma.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from devito.passes.iet.langbase import (
DeviceAwareMixin, LangBB, LangTransformer, ShmTransformer, make_sections_from_imask
)
from devito.symbolics import INT
from devito.symbolics import INT, as_long
from devito.tools import as_tuple, flatten, is_integer, prod
from devito.types import Symbol

Expand Down Expand Up @@ -491,8 +491,17 @@ def expr_symbols(self):

@cached_property
def _generate(self):
# Stringify sections
sections = ''.join([f'[{ccode(i)}:{ccode(j)}]'
# Stringify sections. When a transfer is linearized, the section size
# is a product of the Function's 32-bit per-dimension sizes (e.g.
# `size[0]*size[1]*size[2]`); for a Function with more than ~2**31
# elements this product overflows `int` before being used as an array
# bound, producing a bogus transfer size (#2777). Promote such products
# to 64-bit with `as_long` so the multiplication is carried out in
# 64-bit arithmetic. Non-product bounds (a single size, an offset, a
# constant) cannot overflow and are left untouched.
def cast(e):
return as_long(e) if getattr(e, 'is_Mul', False) else e
sections = ''.join([f'[{ccode(cast(i))}:{ccode(cast(j))}]'
for i, j in self.sections])
arguments = [ccode(i) for i in self.arguments]
return self.pragma % (self.function.name, sections, *arguments)
Expand Down
10 changes: 7 additions & 3 deletions devito/symbolics/manipulation.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
from devito.symbolics.extended_dtypes import LONG
from devito.symbolics.extended_sympy import DefFunction, rfunc
from devito.symbolics.queries import q_leaf
from devito.symbolics.search import retrieve_functions, retrieve_indexed, retrieve_symbols
from devito.symbolics.search import (retrieve_functions, retrieve_indexed,
retrieve_terminals)
from devito.symbolics.unevaluation import Add as UnevalAdd
from devito.symbolics.unevaluation import Mul as UnevalMul
from devito.symbolics.unevaluation import Pow as UnevalPow
Expand Down Expand Up @@ -547,7 +548,10 @@ def as_long(expr):
Convert an expression and its symbolic args to a long integer.
"""
try:
syms = retrieve_symbols(expr)
return expr.subs({s: LONG(s) for s in syms})
# Cast every symbolic leaf, including Indexeds and IndexedPointers
# (e.g. ``vec->size[i]``), not just plain Symbols, so that products of
# such leaves are evaluated in 64-bit arithmetic
terminals = retrieve_terminals(expr)
return expr.subs({s: LONG(s) for s in terminals})
except AttributeError:
return LONG(expr)
22 changes: 22 additions & 0 deletions tests/test_gpu_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,28 @@ def test_linearize(self):
op.apply(time_M=10)
assert np.all(u.data[1] == 11)

def test_linearize_transfer_no_overflow(self):
# When a transfer is linearized, its size is a product of the
# Function's per-dimension sizes (e.g. `size[0]*size[1]*size[2]`).
# These are 32-bit C ints, so for a Function with more than ~2**31
# elements the product overflows `int` before being used as the
# transfer bound, producing a bogus size (issue #2777). Each factor
# must be cast to a 64-bit int so the product is computed in 64-bit.
grid = Grid(shape=(4, 5, 6))

u = TimeFunction(name='u', grid=grid)

op = Operator(Eq(u.forward, u + 1), opt=('advanced', {'linearize': True}))

# The transfer bound is a product of the four `size[i]`, each cast to
# `long`; the multiplication is thus carried out in 64-bit arithmetic
for transfer in op.body.maps + op.body.unmaps:
code = transfer.ccode.value
for i in range(4):
assert f'(long)(u_vec->size[{i}])' in code
# No un-cast `size[i]*` product (which would overflow in 32-bit)
assert 'u_vec->size[0]*' not in code


class TestPassesEdgeCases:

Expand Down
28 changes: 16 additions & 12 deletions tests/test_gpu_openmp.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,14 +54,15 @@ def test_basic(self):
assert trees[0][1].pragmas[0].ccode.value ==\
'omp target teams distribute parallel for collapse(3)'
assert op.body.maps[0].ccode.value ==\
('omp target enter data map(to: u[0:u_vec->size[0]*'
'u_vec->size[1]*u_vec->size[2]*u_vec->size[3]])')
('omp target enter data map(to: u[0:(long)(u_vec->size[3])*'
'(long)(u_vec->size[2])*(long)(u_vec->size[1])*(long)(u_vec->size[0])])')
assert op.body.unmaps[0].ccode.value ==\
('omp target update from(u[0:u_vec->size[0]*'
'u_vec->size[1]*u_vec->size[2]*u_vec->size[3]])')
('omp target update from(u[0:(long)(u_vec->size[3])*'
'(long)(u_vec->size[2])*(long)(u_vec->size[1])*(long)(u_vec->size[0])])')
assert op.body.unmaps[1].ccode.value ==\
('omp target exit data map(release: u[0:u_vec->size[0]*'
'u_vec->size[1]*u_vec->size[2]*u_vec->size[3]]) if(devicerm)')
('omp target exit data map(release: u[0:(long)(u_vec->size[3])*'
'(long)(u_vec->size[2])*(long)(u_vec->size[1])*(long)(u_vec->size[0])]) '
'if(devicerm)')

# Currently, advanced-fsg mode == advanced mode
op1 = Operator(Eq(u.forward, u + 1), language='openmp', opt='advanced-fsg')
Expand Down Expand Up @@ -125,14 +126,17 @@ def test_multiple_eqns(self):
'omp target teams distribute parallel for collapse(3)'
for i, f in enumerate([u, v]):
assert op.body.maps[i].ccode.value ==\
(f'omp target enter data map(to: {f.name}[0:{f.name}_vec->size[0]*'
f'{f.name}_vec->size[1]*{f.name}_vec->size[2]*{f.name}_vec->size[3]])')
(f'omp target enter data map(to: {f.name}'
f'[0:(long)({f.name}_vec->size[3])*(long)({f.name}_vec->size[2])*'
f'(long)({f.name}_vec->size[1])*(long)({f.name}_vec->size[0])])')
assert op.body.unmaps[2*i + 0].ccode.value ==\
(f'omp target update from({f.name}[0:{f.name}_vec->size[0]*'
f'{f.name}_vec->size[1]*{f.name}_vec->size[2]*{f.name}_vec->size[3]])')
(f'omp target update from({f.name}'
f'[0:(long)({f.name}_vec->size[3])*(long)({f.name}_vec->size[2])*'
f'(long)({f.name}_vec->size[1])*(long)({f.name}_vec->size[0])])')
assert op.body.unmaps[2*i + 1].ccode.value ==\
(f'omp target exit data map(release: {f.name}[0:{f.name}_vec->size[0]*'
f'{f.name}_vec->size[1]*{f.name}_vec->size[2]*{f.name}_vec->size[3]]) '
(f'omp target exit data map(release: {f.name}'
f'[0:(long)({f.name}_vec->size[3])*(long)({f.name}_vec->size[2])*'
f'(long)({f.name}_vec->size[1])*(long)({f.name}_vec->size[0])]) '
'if(devicerm)')

def test_multiple_loops(self):
Expand Down