From 60496a818b87f898de9b220387d026ff25e5c54d Mon Sep 17 00:00:00 2001
From: gaoflow <gaobing1230@gmail.com>
Date: Fri, 29 May 2026 04:07:24 +0200
Subject: [PATCH 1/2] compiler: Avoid int32 overflow in linearized host-device
 transfer size

When a host-device data transfer is linearized, its array section size is
emitted as a product of the Function's per-dimension sizes, e.g.
`copyin(u[0:u_vec->size[0]*u_vec->size[1]*u_vec->size[2]*u_vec->size[3]])`.
The `size[i]` fields are 32-bit C ints, so for a Function with more than
~2**31 elements the product overflows `int` before it is used as the
transfer bound, yielding a bogus size and a corrupt/failed device transfer.

Cast each factor of the product to a 64-bit integer so the multiplication is
carried out in 64-bit arithmetic. Casting the whole product would be too late
(the overflow would already have occurred), so each factor is cast
individually. Non-product bounds (a single size, an offset, a constant) cannot
overflow and are left untouched, as are non-transfer expressions.

Fixes #2777
---
 devito/passes/iet/parpragma.py | 23 +++++++++++++++++++++--
 tests/test_gpu_common.py       | 22 ++++++++++++++++++++++
 tests/test_gpu_openmp.py       | 28 ++++++++++++++++------------
 3 files changed, 59 insertions(+), 14 deletions(-)

diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py
index d5752fec52..f14f5ccb4d 100644
--- a/devito/passes/iet/parpragma.py
+++ b/devito/passes/iet/parpragma.py
@@ -17,7 +17,7 @@
 from devito.passes.iet.langbase import (
     DeviceAwareMixin, LangBB, LangTransformer, ShmTransformer, make_sections_from_imask
 )
-from devito.symbolics import INT
+from devito.symbolics import INT, LONG
 from devito.tools import as_tuple, flatten, is_integer, prod
 from devito.types import Symbol
 
@@ -450,6 +450,25 @@ def make_parallel(self, graph, **kwargs):
         return self._make_parallel(graph, sync_mapper=graph.sync_mapper)
 
 
+def _avoid_overflow(expr):
+    """
+    The bounds of a host-device transfer section may be a product of the
+    Function's per-dimension sizes (e.g. ``size[0]*size[1]*size[2]``), as
+    happens when the transferred data is flattened (linearized). These sizes
+    are 32-bit C ints, so for a sufficiently large Function (more than ~2**31
+    elements) the product overflows `int` before it is used as an array bound,
+    producing a bogus transfer size (see issue #2777). Cast each factor of the
+    product to a 64-bit integer so the multiplication is carried out in 64-bit
+    arithmetic. A cast on the whole product would be too late (the overflow
+    would already have happened), hence each factor is cast individually.
+    Non-product bounds (a single size, an offset, a constant) cannot overflow
+    and are left untouched.
+    """
+    if getattr(expr, 'is_Mul', False):
+        return expr.func(*[LONG(a) for a in expr.args])
+    return expr
+
+
 class PragmaTransfer(Pragma, Transfer):
 
     """
@@ -492,7 +511,7 @@ def expr_symbols(self):
     @cached_property
     def _generate(self):
         # Stringify sections
-        sections = ''.join([f'[{ccode(i)}:{ccode(j)}]'
+        sections = ''.join([f'[{ccode(_avoid_overflow(i))}:{ccode(_avoid_overflow(j))}]'
                             for i, j in self.sections])
         arguments = [ccode(i) for i in self.arguments]
         return self.pragma % (self.function.name, sections, *arguments)
diff --git a/tests/test_gpu_common.py b/tests/test_gpu_common.py
index e84d0df5d8..29e628cc87 100644
--- a/tests/test_gpu_common.py
+++ b/tests/test_gpu_common.py
@@ -245,6 +245,28 @@ def test_linearize(self):
         op.apply(time_M=10)
         assert np.all(u.data[1] == 11)
 
+    def test_linearize_transfer_no_overflow(self):
+        # When a transfer is linearized, its size is a product of the
+        # Function's per-dimension sizes (e.g. `size[0]*size[1]*size[2]`).
+        # These are 32-bit C ints, so for a Function with more than ~2**31
+        # elements the product overflows `int` before being used as the
+        # transfer bound, producing a bogus size (issue #2777). Each factor
+        # must be cast to a 64-bit int so the product is computed in 64-bit.
+        grid = Grid(shape=(4, 5, 6))
+
+        u = TimeFunction(name='u', grid=grid)
+
+        op = Operator(Eq(u.forward, u + 1), opt=('advanced', {'linearize': True}))
+
+        # The transfer bound is a product of the four `size[i]`, each cast to
+        # `long`; the multiplication is thus carried out in 64-bit arithmetic
+        for transfer in op.body.maps + op.body.unmaps:
+            code = transfer.ccode.value
+            for i in range(4):
+                assert f'(long)(u_vec->size[{i}])' in code
+            # No un-cast `size[i]*` product (which would overflow in 32-bit)
+            assert 'u_vec->size[0]*' not in code
+
 
 class TestPassesEdgeCases:
 
diff --git a/tests/test_gpu_openmp.py b/tests/test_gpu_openmp.py
index d6505a6f71..aba82257ae 100644
--- a/tests/test_gpu_openmp.py
+++ b/tests/test_gpu_openmp.py
@@ -54,14 +54,15 @@ def test_basic(self):
         assert trees[0][1].pragmas[0].ccode.value ==\
             'omp target teams distribute parallel for collapse(3)'
         assert op.body.maps[0].ccode.value ==\
-            ('omp target enter data map(to: u[0:u_vec->size[0]*'
-             'u_vec->size[1]*u_vec->size[2]*u_vec->size[3]])')
+            ('omp target enter data map(to: u[0:(long)(u_vec->size[0])*'
+             '(long)(u_vec->size[1])*(long)(u_vec->size[2])*(long)(u_vec->size[3])])')
         assert op.body.unmaps[0].ccode.value ==\
-            ('omp target update from(u[0:u_vec->size[0]*'
-             'u_vec->size[1]*u_vec->size[2]*u_vec->size[3]])')
+            ('omp target update from(u[0:(long)(u_vec->size[0])*'
+             '(long)(u_vec->size[1])*(long)(u_vec->size[2])*(long)(u_vec->size[3])])')
         assert op.body.unmaps[1].ccode.value ==\
-            ('omp target exit data map(release: u[0:u_vec->size[0]*'
-             'u_vec->size[1]*u_vec->size[2]*u_vec->size[3]]) if(devicerm)')
+            ('omp target exit data map(release: u[0:(long)(u_vec->size[0])*'
+             '(long)(u_vec->size[1])*(long)(u_vec->size[2])*(long)(u_vec->size[3])]) '
+             'if(devicerm)')
 
         # Currently, advanced-fsg mode == advanced mode
         op1 = Operator(Eq(u.forward, u + 1), language='openmp', opt='advanced-fsg')
@@ -125,14 +126,17 @@ def test_multiple_eqns(self):
             'omp target teams distribute parallel for collapse(3)'
         for i, f in enumerate([u, v]):
             assert op.body.maps[i].ccode.value ==\
-                (f'omp target enter data map(to: {f.name}[0:{f.name}_vec->size[0]*'
-                 f'{f.name}_vec->size[1]*{f.name}_vec->size[2]*{f.name}_vec->size[3]])')
+                (f'omp target enter data map(to: {f.name}'
+                 f'[0:(long)({f.name}_vec->size[0])*(long)({f.name}_vec->size[1])*'
+                 f'(long)({f.name}_vec->size[2])*(long)({f.name}_vec->size[3])])')
             assert op.body.unmaps[2*i + 0].ccode.value ==\
-                (f'omp target update from({f.name}[0:{f.name}_vec->size[0]*'
-                 f'{f.name}_vec->size[1]*{f.name}_vec->size[2]*{f.name}_vec->size[3]])')
+                (f'omp target update from({f.name}'
+                 f'[0:(long)({f.name}_vec->size[0])*(long)({f.name}_vec->size[1])*'
+                 f'(long)({f.name}_vec->size[2])*(long)({f.name}_vec->size[3])])')
             assert op.body.unmaps[2*i + 1].ccode.value ==\
-                (f'omp target exit data map(release: {f.name}[0:{f.name}_vec->size[0]*'
-                 f'{f.name}_vec->size[1]*{f.name}_vec->size[2]*{f.name}_vec->size[3]]) '
+                (f'omp target exit data map(release: {f.name}'
+                 f'[0:(long)({f.name}_vec->size[0])*(long)({f.name}_vec->size[1])*'
+                 f'(long)({f.name}_vec->size[2])*(long)({f.name}_vec->size[3])]) '
                  'if(devicerm)')
 
     def test_multiple_loops(self):

From 74ee8dbb7e45b05abb25be08f058000a5e3c5c0f Mon Sep 17 00:00:00 2001
From: gaoflow <gaobing1230@gmail.com>
Date: Fri, 29 May 2026 10:51:07 +0200
Subject: [PATCH 2/2] compiler: Use as_long for transfer size cast, fix it for
 Indexed leaves

Address review: replace the ad-hoc _avoid_overflow helper with the existing
as_long. as_long only substituted plain Symbols (retrieve_symbols), so it was
a no-op on the IndexedPointer size factors (vec->size[i]) of a linearized
transfer bound; extend it to retrieve_terminals so Indexed/IndexedPointer
leaves are cast too. Keep the cast scoped to Mul products in PragmaTransfer so
non-linearized multi-dimensional sections are not needlessly upcast.
---
 devito/passes/iet/parpragma.py   | 34 +++++++++++---------------------
 devito/symbolics/manipulation.py | 10 +++++++---
 tests/test_gpu_openmp.py         | 24 +++++++++++-----------
 3 files changed, 31 insertions(+), 37 deletions(-)

diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py
index f14f5ccb4d..227d380a5b 100644
--- a/devito/passes/iet/parpragma.py
+++ b/devito/passes/iet/parpragma.py
@@ -17,7 +17,7 @@
 from devito.passes.iet.langbase import (
     DeviceAwareMixin, LangBB, LangTransformer, ShmTransformer, make_sections_from_imask
 )
-from devito.symbolics import INT, LONG
+from devito.symbolics import INT, as_long
 from devito.tools import as_tuple, flatten, is_integer, prod
 from devito.types import Symbol
 
@@ -450,25 +450,6 @@ def make_parallel(self, graph, **kwargs):
         return self._make_parallel(graph, sync_mapper=graph.sync_mapper)
 
 
-def _avoid_overflow(expr):
-    """
-    The bounds of a host-device transfer section may be a product of the
-    Function's per-dimension sizes (e.g. ``size[0]*size[1]*size[2]``), as
-    happens when the transferred data is flattened (linearized). These sizes
-    are 32-bit C ints, so for a sufficiently large Function (more than ~2**31
-    elements) the product overflows `int` before it is used as an array bound,
-    producing a bogus transfer size (see issue #2777). Cast each factor of the
-    product to a 64-bit integer so the multiplication is carried out in 64-bit
-    arithmetic. A cast on the whole product would be too late (the overflow
-    would already have happened), hence each factor is cast individually.
-    Non-product bounds (a single size, an offset, a constant) cannot overflow
-    and are left untouched.
-    """
-    if getattr(expr, 'is_Mul', False):
-        return expr.func(*[LONG(a) for a in expr.args])
-    return expr
-
-
 class PragmaTransfer(Pragma, Transfer):
 
     """
@@ -510,8 +491,17 @@ def expr_symbols(self):
 
     @cached_property
     def _generate(self):
-        # Stringify sections
-        sections = ''.join([f'[{ccode(_avoid_overflow(i))}:{ccode(_avoid_overflow(j))}]'
+        # Stringify sections. When a transfer is linearized, the section size
+        # is a product of the Function's 32-bit per-dimension sizes (e.g.
+        # `size[0]*size[1]*size[2]`); for a Function with more than ~2**31
+        # elements this product overflows `int` before being used as an array
+        # bound, producing a bogus transfer size (#2777). Promote such products
+        # to 64-bit with `as_long` so the multiplication is carried out in
+        # 64-bit arithmetic. Non-product bounds (a single size, an offset, a
+        # constant) cannot overflow and are left untouched.
+        def cast(e):
+            return as_long(e) if getattr(e, 'is_Mul', False) else e
+        sections = ''.join([f'[{ccode(cast(i))}:{ccode(cast(j))}]'
                             for i, j in self.sections])
         arguments = [ccode(i) for i in self.arguments]
         return self.pragma % (self.function.name, sections, *arguments)
diff --git a/devito/symbolics/manipulation.py b/devito/symbolics/manipulation.py
index 57d9314e16..a212053a07 100644
--- a/devito/symbolics/manipulation.py
+++ b/devito/symbolics/manipulation.py
@@ -11,7 +11,8 @@
 from devito.symbolics.extended_dtypes import LONG
 from devito.symbolics.extended_sympy import DefFunction, rfunc
 from devito.symbolics.queries import q_leaf
-from devito.symbolics.search import retrieve_functions, retrieve_indexed, retrieve_symbols
+from devito.symbolics.search import (retrieve_functions, retrieve_indexed,
+                                     retrieve_terminals)
 from devito.symbolics.unevaluation import Add as UnevalAdd
 from devito.symbolics.unevaluation import Mul as UnevalMul
 from devito.symbolics.unevaluation import Pow as UnevalPow
@@ -547,7 +548,10 @@ def as_long(expr):
     Convert an expression and its symbolic args to a long integer.
     """
     try:
-        syms = retrieve_symbols(expr)
-        return expr.subs({s: LONG(s) for s in syms})
+        # Cast every symbolic leaf, including Indexeds and IndexedPointers
+        # (e.g. ``vec->size[i]``), not just plain Symbols, so that products of
+        # such leaves are evaluated in 64-bit arithmetic
+        terminals = retrieve_terminals(expr)
+        return expr.subs({s: LONG(s) for s in terminals})
     except AttributeError:
         return LONG(expr)
diff --git a/tests/test_gpu_openmp.py b/tests/test_gpu_openmp.py
index aba82257ae..98bc8e170a 100644
--- a/tests/test_gpu_openmp.py
+++ b/tests/test_gpu_openmp.py
@@ -54,14 +54,14 @@ def test_basic(self):
         assert trees[0][1].pragmas[0].ccode.value ==\
             'omp target teams distribute parallel for collapse(3)'
         assert op.body.maps[0].ccode.value ==\
-            ('omp target enter data map(to: u[0:(long)(u_vec->size[0])*'
-             '(long)(u_vec->size[1])*(long)(u_vec->size[2])*(long)(u_vec->size[3])])')
+            ('omp target enter data map(to: u[0:(long)(u_vec->size[3])*'
+             '(long)(u_vec->size[2])*(long)(u_vec->size[1])*(long)(u_vec->size[0])])')
         assert op.body.unmaps[0].ccode.value ==\
-            ('omp target update from(u[0:(long)(u_vec->size[0])*'
-             '(long)(u_vec->size[1])*(long)(u_vec->size[2])*(long)(u_vec->size[3])])')
+            ('omp target update from(u[0:(long)(u_vec->size[3])*'
+             '(long)(u_vec->size[2])*(long)(u_vec->size[1])*(long)(u_vec->size[0])])')
         assert op.body.unmaps[1].ccode.value ==\
-            ('omp target exit data map(release: u[0:(long)(u_vec->size[0])*'
-             '(long)(u_vec->size[1])*(long)(u_vec->size[2])*(long)(u_vec->size[3])]) '
+            ('omp target exit data map(release: u[0:(long)(u_vec->size[3])*'
+             '(long)(u_vec->size[2])*(long)(u_vec->size[1])*(long)(u_vec->size[0])]) '
              'if(devicerm)')
 
         # Currently, advanced-fsg mode == advanced mode
@@ -127,16 +127,16 @@ def test_multiple_eqns(self):
         for i, f in enumerate([u, v]):
             assert op.body.maps[i].ccode.value ==\
                 (f'omp target enter data map(to: {f.name}'
-                 f'[0:(long)({f.name}_vec->size[0])*(long)({f.name}_vec->size[1])*'
-                 f'(long)({f.name}_vec->size[2])*(long)({f.name}_vec->size[3])])')
+                 f'[0:(long)({f.name}_vec->size[3])*(long)({f.name}_vec->size[2])*'
+                 f'(long)({f.name}_vec->size[1])*(long)({f.name}_vec->size[0])])')
             assert op.body.unmaps[2*i + 0].ccode.value ==\
                 (f'omp target update from({f.name}'
-                 f'[0:(long)({f.name}_vec->size[0])*(long)({f.name}_vec->size[1])*'
-                 f'(long)({f.name}_vec->size[2])*(long)({f.name}_vec->size[3])])')
+                 f'[0:(long)({f.name}_vec->size[3])*(long)({f.name}_vec->size[2])*'
+                 f'(long)({f.name}_vec->size[1])*(long)({f.name}_vec->size[0])])')
             assert op.body.unmaps[2*i + 1].ccode.value ==\
                 (f'omp target exit data map(release: {f.name}'
-                 f'[0:(long)({f.name}_vec->size[0])*(long)({f.name}_vec->size[1])*'
-                 f'(long)({f.name}_vec->size[2])*(long)({f.name}_vec->size[3])]) '
+                 f'[0:(long)({f.name}_vec->size[3])*(long)({f.name}_vec->size[2])*'
+                 f'(long)({f.name}_vec->size[1])*(long)({f.name}_vec->size[0])]) '
                  'if(devicerm)')
 
     def test_multiple_loops(self):