diff --git a/devito/ir/cgen/printer.py b/devito/ir/cgen/printer.py index 3f87496c62..dec3792b31 100644 --- a/devito/ir/cgen/printer.py +++ b/devito/ir/cgen/printer.py @@ -212,7 +212,12 @@ def _print_Pow(self, expr): suffix = self.func_literal(expr) base = self._print(expr.base) if equal_valued(expr.exp, -1): - return self._print_Float(Float(1.0)) + '/' + \ + # Pick the literal precision from this Pow's dtype rather than + # the printer default, so e.g. ``Pow(DOUBLE(h), -1)`` emits + # ``1.0/(double)h`` not ``1.0F/(double)h``. This branch only + # fires when the surrounding Mul printer chose not to group + # the Pow into a denominator (e.g. lone reciprocal). + return f'1.0{self.prec_literal(expr)}/' + \ self.parenthesize(expr.base, PREC) elif equal_valued(expr.exp, 0.5): return f'{self.ns(expr)}sqrt{suffix}({base})' diff --git a/devito/ir/clusters/cluster.py b/devito/ir/clusters/cluster.py index cbf206b3ff..8d880501e2 100644 --- a/devito/ir/clusters/cluster.py +++ b/devito/ir/clusters/cluster.py @@ -4,7 +4,7 @@ import numpy as np -from devito.ir.equations import ClusterizedEq +from devito.ir.equations import clusterize_eq from devito.ir.support import ( PARALLEL, PARALLEL_IF_PVT, BaseGuardBoundNext, DataSpace, Forward, Guards, Interval, IntervalGroup, IterationSpace, PrefetchUpdate, Properties, Scope, WaitLock, WithLock, @@ -50,7 +50,7 @@ class Cluster: def __init__(self, exprs, ispace=null_ispace, guards=None, properties=None, syncs=None, halo_scheme=None): - self._exprs = tuple(ClusterizedEq(e, ispace=ispace) for e in as_tuple(exprs)) + self._exprs = tuple(clusterize_eq(e, ispace=ispace) for e in as_tuple(exprs)) self._ispace = ispace self._guards = Guards(guards or {}) self._syncs = normalize_syncs(syncs or {}) diff --git a/devito/ir/equations/equation.py b/devito/ir/equations/equation.py index 8d72704b79..43445d47c9 100644 --- a/devito/ir/equations/equation.py +++ b/devito/ir/equations/equation.py @@ -1,4 +1,4 @@ -from functools import cached_property +from functools import cached_property, singledispatch import numpy as np import sympy @@ -11,17 +11,31 @@ ) from devito.symbolics import IntDiv, limits_mapper, uxreplace from devito.tools import Pickable, Tag, frozendict -from devito.types import Eq, Inc, ReduceMax, ReduceMin, ReduceMinMax, relational_min +from devito.types import ( + Eq, Inc, IncrInterpolation, Injection, InjectionMixin, Interpolation, + InterpolationMixin, ReduceMax, ReduceMin, ReduceMinMax, SparseEq, SparseOpMixin, + relational_min +) __all__ = [ 'ClusterizedEq', + 'ClusterizedIncrInterpolation', + 'ClusterizedInjection', + 'ClusterizedInterpolation', + 'ClusterizedSparseEq', 'DummyEq', 'LoweredEq', + 'LoweredIncrInterpolation', + 'LoweredInjection', + 'LoweredInterpolation', + 'LoweredSparseEq', 'OpInc', 'OpMax', 'OpMin', 'OpMinMax', + 'clusterize_eq', 'identity_mapper', + 'lower_eq', ] @@ -30,6 +44,8 @@ class IREq(sympy.Eq, Pickable): __rargs__ = ('lhs', 'rhs') __rkwargs__ = ('ispace', 'conditionals', 'implicit_dims', 'operation') + is_SparseOperation = False + def _hashable_content(self): return (*super()._hashable_content(), *tuple(getattr(self, i) for i in self.__rkwargs__)) @@ -115,16 +131,15 @@ class Operation(Tag): @classmethod def detect(cls, expr): - reduction_mapper = { - Inc: OpInc, - ReduceMax: OpMax, - ReduceMin: OpMin, - ReduceMinMax: OpMinMax - } - try: - return reduction_mapper[type(expr)] - except KeyError: - pass + reduction_mapper = ( + (ReduceMinMax, OpMinMax), + (ReduceMin, OpMin), + (ReduceMax, OpMax), + (Inc, OpInc), + ) + for kls, op in reduction_mapper: + if isinstance(expr, kls): + return op # NOTE: in the future we might want to track down other kinds # of operations here (e.g., memcpy). However, we don't care for @@ -204,8 +219,9 @@ def __new__(cls, *args, **kwargs): accesses = detect_accesses(expr) dimensions = Stencil.union(*accesses.values()) - # Separate out the SubIterators from the main iteration Dimensions, that - # is those which define an actual iteration space + # Separate out the SubIterators from the main iteration + # Dimensions, that is those which define an actual + # iteration space iterators = {} for d in dimensions: if d.is_SubIterator: @@ -271,6 +287,43 @@ def func(self, *args): return self._rebuild(*args, evaluate=False) +class LoweredSparseEq(SparseOpMixin, LoweredEq): + + """ + The IR counterpart of ``SparseEq``: a regular ``LoweredEq`` that + also carries the ``interpolator`` metadata used by the IET pass + ``lower_sparse_ops`` to wrap the resulting ``p_*, rp_*`` iteration + nest in an ElementalFunction. Subclassed by + ``LoweredInterpolation`` / ``LoweredIncrInterpolation`` / + ``LoweredInjection`` for the per-operation polymorphic behaviour. + """ + + __rkwargs__ = LoweredEq.__rkwargs__ + ('interpolator',) + + +class LoweredInterpolation(InterpolationMixin, LoweredSparseEq): + """IR counterpart of ``Interpolation``.""" + pass + + +class LoweredIncrInterpolation(InterpolationMixin, LoweredSparseEq): + """IR counterpart of ``IncrInterpolation``.""" + pass + + +class LoweredInjection(InjectionMixin, LoweredSparseEq): + """IR counterpart of ``Injection``.""" + pass + + +# Map user-level sparse-op classes to their IR-level counterparts. +_lowered_sparse_cls = { + Interpolation: LoweredInterpolation, + IncrInterpolation: LoweredIncrInterpolation, + Injection: LoweredInjection, +} + + class ClusterizedEq(IREq): """ @@ -326,6 +379,41 @@ def __new__(cls, *args, **kwargs): func = IREq._rebuild +class ClusterizedSparseEq(SparseOpMixin, ClusterizedEq): + + """ + Frozen counterpart of ``LoweredSparseEq``: the same regular + ``ClusterizedEq`` augmented with ``interpolator`` so the IET pass + ``lower_sparse_ops`` can identify and rewrite the sparse op's + iteration nest. Subclassed by ``ClusterizedInterpolation`` / + ``ClusterizedIncrInterpolation`` / ``ClusterizedInjection``. + """ + + __rkwargs__ = ClusterizedEq.__rkwargs__ + ('interpolator',) + + +class ClusterizedInterpolation(InterpolationMixin, ClusterizedSparseEq): + """Frozen counterpart of ``LoweredInterpolation``.""" + pass + + +class ClusterizedIncrInterpolation(InterpolationMixin, ClusterizedSparseEq): + """Frozen counterpart of ``LoweredIncrInterpolation``.""" + pass + + +class ClusterizedInjection(InjectionMixin, ClusterizedSparseEq): + """Frozen counterpart of ``LoweredInjection``.""" + pass + + +_clusterized_sparse_cls = { + LoweredInterpolation: ClusterizedInterpolation, + LoweredIncrInterpolation: ClusterizedIncrInterpolation, + LoweredInjection: ClusterizedInjection, +} + + class DummyEq(ClusterizedEq): """ @@ -345,3 +433,62 @@ def __new__(cls, *args, **kwargs): else: raise ValueError(f"Cannot construct DummyEq from args={str(args)}") return ClusterizedEq.__new__(cls, obj, ispace=obj.ispace) + + +@singledispatch +def lower_eq(eq): + """ + Promote a user-level ``Eq`` to its ``LoweredEq`` counterpart, ready + for the cluster pipeline. The dispatch matches the dynamic type of + ``eq``; ``SparseEq`` and friends get their own branch. + """ + return LoweredEq(eq) + + +@lower_eq.register(SparseEq) +def _(eq): + # Augment ``implicit_dims`` with the SparseFunction's own iteration + # Dimensions (e.g. ``p_sf`` and any extra SparseFunction dims) so + # the cluster scheduler sees them. Grid Dimensions reached through + # the rhs Function are deliberately *not* added: SubDomain-derived + # SubDimensions would otherwise spuriously appear in the + # IterationSpace, and grid Dimensions are already discovered via + # the radius ConditionalDimensions in the rhs. + interp = eq.interpolator + sf_dims = tuple(interp.sfunction.dimensions) + user = tuple(eq.implicit_dims or ()) + if interp.sfunction._sparse_position == -1: + augmented = sf_dims + user + else: + augmented = user + sf_dims + + if augmented != tuple(eq.implicit_dims or ()): + eq = eq.func(eq.lhs, eq.rhs, interpolator=interp, + implicit_dims=augmented) + + lowered_cls = _lowered_sparse_cls[type(eq)] + obj = lowered_cls(eq) + obj._interpolator = interp + return obj + + +@singledispatch +def clusterize_eq(eq, **kwargs): + """ + Freeze a ``LoweredEq`` into its ``ClusterizedEq`` counterpart, + suitable for use in a ``Cluster``. Subclasses with extra payload + (e.g. ``LoweredSparseEq``) dispatch to their frozen counterpart. + """ + return ClusterizedEq(eq, **kwargs) + + +@clusterize_eq.register(LoweredSparseEq) +def _(eq, **kwargs): + return _clusterized_sparse_cls[type(eq)](eq, **kwargs) + + +@clusterize_eq.register(ClusterizedSparseEq) +def _(eq, **kwargs): + # ``eq`` is already clusterized; rebuild via its own class to preserve + # the per-operation polymorphic behaviour. + return type(eq)(eq, **kwargs) diff --git a/devito/mpi/halo_scheme.py b/devito/mpi/halo_scheme.py index 3b4bcb5bc8..884460d44c 100644 --- a/devito/mpi/halo_scheme.py +++ b/devito/mpi/halo_scheme.py @@ -134,7 +134,6 @@ class HaloScheme: """ def __init__(self, exprs, ispace): - # Derive the halo exchanges self._mapper = frozendict(classify(exprs, ispace)) # Track the IterationSpace offsets induced by SubDomains/SubDimensions, diff --git a/devito/operations/interpolators.py b/devito/operations/interpolators.py index 6467e65df4..8081312a84 100644 --- a/devito/operations/interpolators.py +++ b/devito/operations/interpolators.py @@ -1,7 +1,6 @@ from abc import ABC, abstractmethod from contextlib import suppress from functools import cached_property, wraps -from itertools import groupby import numpy as np import sympy @@ -15,8 +14,11 @@ from devito.finite_differences.elementary import floor from devito.logger import warning from devito.symbolics import INT, retrieve_function_carriers, retrieve_functions -from devito.tools import Pickable, as_tuple, filter_ordered, flatten, memoized_meth -from devito.types import Eq, Evaluable, Inc, SubFunction, Symbol +from devito.symbolics.extended_dtypes import DOUBLE +from devito.tools import as_tuple, filter_ordered, memoized_meth +from devito.types import ( + Eq, Inc, IncrInterpolation, Injection, Interpolation, SubFunction, Symbol +) from devito.types.utils import DimensionTuple __all__ = ['LinearInterpolator', 'PrecomputedInterpolator', 'SincInterpolator'] @@ -81,106 +83,41 @@ def _extract_subdomain(variables): return None -class UnevaluatedSparseOperation(sympy.Expr, Evaluable, Pickable): - +def _build_interpolation(expr, increment, implicit_dims, self_subs, interpolator): """ - Represents an Injection or an Interpolation operation performed on a - SparseFunction. Evaluates to a list of Eq objects. - - Parameters - ---------- - interpolator : Interpolator - Interpolator object that will be used to evaluate the operation. - callback : callable - A routine generating the symbolic expressions for the operation. + Construct the sparse-op Eq for an interpolation: the synthetic Eq + is ``Eq(sf[..., p_*], expr[..., rp_*])``; with ``increment`` it is + an ``Inc``. User-supplied ``implicit_dims`` are carried as-is; the + SparseFunction's iteration Dimensions are augmented in by + ``lower_eq`` so the cluster pipeline sees them. """ + eq = interpolator._interpolate(expr=expr, increment=increment, + self_subs=self_subs, + implicit_dims=None) + cls = IncrInterpolation if isinstance(eq, Inc) else Interpolation + return cls(eq.lhs, eq.rhs, interpolator=interpolator, + implicit_dims=implicit_dims) - subdomain = None - __rargs__ = ('interpolator',) - - def __new__(cls, interpolator): - obj = super().__new__(cls) - - obj.interpolator = interpolator - - return obj - - def _evaluate(self, **kwargs): - return_value = self.operation(**kwargs) - assert(all(isinstance(i, Eq) for i in return_value)) - return return_value - - @abstractmethod - def operation(self, **kwargs): - pass - - def __add__(self, other): - return flatten([self, other]) - - def __radd__(self, other): - return flatten([other, self]) - - -class Interpolation(UnevaluatedSparseOperation): - - """ - Represents an Interpolation operation performed on a SparseFunction. - Evaluates to a list of Eq objects. - """ - - __rargs__ = ('expr', 'increment', 'implicit_dims', 'self_subs') + \ - UnevaluatedSparseOperation.__rargs__ - - def __new__(cls, expr, increment, implicit_dims, self_subs, interpolator): - obj = super().__new__(cls, interpolator) - - # TODO: unused now, but will be necessary to compute the adjoint - obj.expr = expr - obj.increment = increment - obj.self_subs = self_subs - obj.implicit_dims = implicit_dims - - return obj - - def operation(self, **kwargs): - return self.interpolator._interpolate(expr=self.expr, increment=self.increment, - self_subs=self.self_subs, - implicit_dims=self.implicit_dims) - - def __repr__(self): - return (f"Interpolation({repr(self.expr)} into " - f"{repr(self.interpolator.sfunction)})") - - __str__ = __repr__ - - -class Injection(UnevaluatedSparseOperation): +def _build_injection(field, expr, implicit_dims, interpolator): """ - Represents an Injection operation performed on a SparseFunction. - Evaluates to a list of Eq objects. + Construct the ``Injection``(s) for an injection: each synthetic Eq + is ``Inc(field[..., x, y, ...], weights * expr[..., rp_*])`` + produced by ``interpolator._inject``. A multi-field injection + expands into one ``Injection`` per ``(field, expr)`` pair so each + target field is individually visible to the cluster pipeline. + User-supplied ``implicit_dims`` are carried as-is; sparse-function + iteration Dimensions are augmented in by ``lower_eq``. """ - - __rargs__ = ('field', 'expr', 'implicit_dims') + UnevaluatedSparseOperation.__rargs__ - - def __new__(cls, field, expr, implicit_dims, interpolator): - obj = super().__new__(cls, interpolator) - - # TODO: unused now, but will be necessary to compute the adjoint - obj.field = field - obj.expr = expr - obj.implicit_dims = implicit_dims - - return obj - - def operation(self, **kwargs): - return self.interpolator._inject(expr=self.expr, field=self.field, - implicit_dims=self.implicit_dims) - - def __repr__(self): - return f"Injection({repr(self.expr)} into {repr(self.field)})" - - __str__ = __repr__ + fields, exprs = as_tuple(field), as_tuple(expr) + if len(exprs) == 1: + exprs = tuple(exprs[0] for _ in fields) + eqs = [] + for (f, e) in zip(fields, exprs, strict=True): + inc = interpolator._inject(field=f, expr=e, implicit_dims=None) + eqs.append(Injection(inc.lhs, inc.rhs, interpolator=interpolator, + implicit_dims=implicit_dims)) + return eqs[0] if len(eqs) == 1 else eqs class GenericInterpolator(ABC): @@ -265,14 +202,21 @@ def _rdim(self, subdomain=None, shifts=None): else: gdims = self._gdims - # Make radius dimension conditional to avoid OOB + # Make the radius dimensions conditional to avoid OOB accesses. + # ``rd ∈ [-r+1, r]`` and the access is ``field[pos + rd]``, so + # the OOB guard on ``rd`` reads ``pos + rd ∈ [d_min - r, d_max + r]``. rdims = [] - pos = self.sfunction._position_map(shifts=shifts).values() - - for (d, rd, p) in zip(gdims, self._cdim, pos, strict=True): - # Add conditional to avoid OOB - lb = sympy.And(rd + p >= d.symbolic_min - self.r, evaluate=False) - ub = sympy.And(rd + p <= d.symbolic_max + self.r, evaluate=False) + pos_symbols = self.sfunction._pos_symbols(shifts=shifts) + + for d in gdims: + # The radius CustomDimension is keyed on the grid Dimension + # (``d.root``) so a SubDomain-restricted operation reuses + # the same ``rp_*`` dim as the full-grid case; only the + # bounds carry the SubDomain's symbolic_min/max. + rd = self.sfunction._crdim(d.root) + pos = pos_symbols[d.root] + lb = sympy.And(pos + rd >= d.symbolic_min - self.r, evaluate=False) + ub = sympy.And(pos + rd <= d.symbolic_max + self.r, evaluate=False) cond = sympy.And(lb, ub, evaluate=False) # Insert a check to catch cases where interpolation/injection is @@ -314,42 +258,91 @@ def _augment_implicit_dims(self, implicit_dims, extras=None): def _coeff_temps(self, implicit_dims, shifts=None): return [] - def _positions(self, implicit_dims, shifts=None): - return [Eq(v, INT(floor(k)), implicit_dims=implicit_dims) - for k, v in self.sfunction._position_map(shifts=shifts).items()] - - def _interp_idx(self, variables, implicit_dims=None, subdomain=None, - shifts=None): + @memoized_meth + def _raw_pos_symbols(self, shifts=None): + """ + Per-Dimension Symbol holding the unrounded grid-relative position + ``(coord - origin - shift)/h``. Both the integer position + (``floor(...)``) and the linear-interp fractional part + (``... - floor(...)``) reuse this Symbol so the divide-and-shift + expression is emitted only once per sparse point. """ - Generate interpolation indices for the DiscreteFunctions in `variables`. + dtype = self.sfunction.coordinates.dtype + symbols = [] + for d, s in zip(self.grid.dimensions, + shifts or (0,) * len(self.grid.dimensions), + strict=True): + suffix = '_s1' if s != 0 else '' + symbols.append(Symbol(name=f'rpos{d}{suffix}', dtype=dtype)) + return DimensionTuple(*symbols, getters=self.grid.dimensions) - `shifts` is a per-Dimension physical offset for the target field's - origin: it only affects the integer position symbol via the position - map (`pos = floor((c - o - shift)/h)`). The index substitution itself - is unchanged — any staggered offset in a field's own symbolic access is - absorbed by Devito's normal indexing. + def _positions(self, implicit_dims, shifts=None): + # The ``(coord - origin)/h`` subtract is the only step that can lose + # precision to catastrophic cancellation when ``coord`` and ``origin`` + # are large and close to each other (e.g. an origin-shifted survey). + # Promote ``origin`` and ``h`` to float64 so the subtract and divide + # happen in double precision in C (one cast operand promotes the + # whole expression); the result narrows to the field dtype on store + # to ``rpos*`` so downstream ``floor`` / fractional math stays in + # the field dtype. + rposs = self._raw_pos_symbols(shifts=shifts) + subs = {o: DOUBLE(o) for o in self.grid.origin_symbols} + subs.update({d.spacing: DOUBLE(d.spacing) for d in self._gdims}) + return [Eq(rposs[d], k.xreplace(subs), implicit_dims=implicit_dims) + for d, k in zip(self._gdims, + self.sfunction._position_map(shifts=shifts), + strict=True)] + \ + [Eq(v, INT(floor(rposs[d])), implicit_dims=implicit_dims) + for d, v in zip(self._gdims, + self.sfunction._position_map(shifts=shifts).values(), + strict=True)] + + def sparse_temps(self, rhs, implicit_dims, field=None): """ - pos = self.sfunction._position_map(shifts=shifts).values() + Position/coefficient temps for a sparse op with right-hand side + ``rhs``. For an injection, ``field`` drives the per-Dimension + shifts so the temps' lhs (``pos*`` symbols) match the rhs of a + staggered injection; for an interpolation, ``field`` is None + and no shifts are applied. + """ + if field is not None: + extras = [field] + list(retrieve_function_carriers(rhs)) + shifts = self._field_shifts(field) + else: + extras = list(retrieve_function_carriers(rhs)) or None + shifts = None - # Temporaries for the position - temps = self._positions(implicit_dims, shifts=shifts) + implicit_dims = self._augment_implicit_dims(implicit_dims, extras=extras) + return list(self._positions(implicit_dims, shifts=shifts)) + \ + list(self._coeff_temps(implicit_dims, shifts=shifts)) - # Coefficient symbol expression - temps.extend(self._coeff_temps(implicit_dims, shifts=shifts)) + def _interp_idx(self, variables, subdomain=None, shifts=None): + """ + Generate the indirect-access index substitutions for the + DiscreteFunctions in ``variables``. Each grid Dimension ``x`` + is replaced with ``pos_x + rd_x``, where ``rd_x`` is the + StencilDimension for the radius and ``pos_x`` is the per-point + position offset; together they realise the radius-neighbourhood + access ``field[pos_x + rd_x, pos_y + rd_y]``. + + ``shifts`` is a per-Dimension physical offset for the target + field's origin (e.g. ``h_x/2`` for a field staggered in ``x``); + it only affects the integer position symbol via the position + map (``pos = floor((c - o - shift)/h)``). + """ + pos = self.sfunction._position_map(shifts=shifts).values() # Substitution mapper for variables mapper = self._rdim(subdomain=subdomain, shifts=shifts).getters # Index substitution to make in variables subs = { - ki: c + p + ki: p + c for ((k, c), p) in zip(mapper.items(), pos, strict=True) for ki in {k, k.root} } - idx_subs = {v: v.subs(subs) for v in variables} - - return idx_subs, temps + return {v: v.subs(subs) for v in variables} @check_radius @check_coords @@ -370,7 +363,7 @@ def interpolate(self, expr, increment=False, self_subs=None, implicit_dims=None) """ if self_subs is None: self_subs = {} - return Interpolation(expr, increment, implicit_dims, self_subs, self) + return _build_interpolation(expr, increment, implicit_dims, self_subs, self) @check_radius @check_coords @@ -389,125 +382,83 @@ def inject(self, field, expr, implicit_dims=None): injection expression, but that should be honored when constructing the operator. """ - return Injection(field, expr, implicit_dims, self) + return _build_injection(field, expr, implicit_dims, self) def _interpolate(self, expr, increment=False, self_subs=None, implicit_dims=None): """ - Generate equations interpolating an arbitrary expression into `self`. + Build the synthetic single Eq for an interpolation: - Parameters - ---------- - expr : expr-like - Input expression to interpolate. - increment: bool, optional - If True, generate increments (Inc) rather than assignments (Eq). - implicit_dims : Dimension or list of Dimension, optional - An ordered list of Dimensions that do not explicitly appear in the - interpolation expression, but that should be honored when constructing - the operator. + Eq(sf[..., p_*], expr[..., rp_*]) # or Inc when increment=True + + The grid Dimensions inside ``expr`` are substituted for the + radius ConditionalDimensions ``rp_*``, whose parent is the + original grid Dimension. The cluster scheduler therefore derives + an IterationSpace ``(..., p_*, rp_*)``; the IET pass + ``lower_sparse_ops`` later wraps that nest in an + ElementalFunction and inserts the position/weight temps and + accumulator inside it. """ - # Derivatives must be evaluated before the introduction of indirect accesses - with suppress(AttributeError): - expr = expr._eval_at(self.sfunction).evaluate + # Derivatives must be evaluated before the introduction of indirect accesses. + # CSE will pick up any shared subexpression. + try: + _expr = expr._eval_at(self.sfunction).evaluate + except AttributeError: + _expr = expr if self_subs is None: self_subs = {} - variables = list(retrieve_function_carriers(expr)) + variables = list(retrieve_function_carriers(_expr)) subdomain = _extract_subdomain(variables) - # Implicit dimensions implicit_dims = self._augment_implicit_dims(implicit_dims, variables) + idx_subs = self._interp_idx(variables, subdomain=subdomain) - # List of indirection indices for all adjacent grid points - idx_subs, temps = self._interp_idx(variables, implicit_dims=implicit_dims, - subdomain=subdomain) - - # Accumulate point-wise contributions into a temporary - rhs = Symbol(name=f'sum{self.sfunction.name}', dtype=self.sfunction.dtype) - summands = [Eq(rhs, 0., implicit_dims=implicit_dims)] - # Substitute coordinate base symbols into the interpolation coefficients - weights = self._weights(subdomain=subdomain) - summands.extend([Inc(rhs, (weights * expr).xreplace(idx_subs), - implicit_dims=implicit_dims)]) - - # Write/Incr `self` lhs = self.sfunction.subs(self_subs) - ecls = Inc if increment else Eq - last = [ecls(lhs, rhs, implicit_dims=implicit_dims)] + rhs = _expr.xreplace(idx_subs) - return temps + summands + last + ecls = Inc if increment else Eq + return ecls(lhs, rhs, implicit_dims=implicit_dims) def _inject(self, field, expr, implicit_dims=None): """ - Generate equations injecting an arbitrary expression into a field. + Build the synthetic single Inc for an injection: - Parameters - ---------- - field : Function - Input field into which the injection is performed. - expr : expr-like - Injected expression. - implicit_dims : Dimension or list of Dimension, optional - An ordered list of Dimensions that do not explicitly appear in the - injection expression, but that should be honored when constructing - the operator. - """ - # Make iterable to support inject((u, v), expr=expr) - # or inject((u, v), expr=(expr1, expr2)) - fields, exprs = as_tuple(field), as_tuple(expr) - - # Provide either one expr per field or on expr for all fields - if len(fields) > 1: - if len(exprs) == 1: - exprs = tuple(exprs[0] for _ in fields) - else: - assert len(exprs) == len(fields) - - subdomain = _extract_subdomain(fields) + Inc(field[..., pos_x + rd_x, ...], weights(rd_*) * expr[..., pos + rd_*]) + Both lhs and rhs share the radius-indexed access pattern so the + cluster scheduler derives the same ``(..., p_*, rd_*)`` + IterationSpace whether the operation is interpolation or + injection. The IET pass ``lower_sparse_ops`` wraps the resulting + nest in an ElementalFunction. + """ # Derivatives must be evaluated before the introduction of indirect # accesses. Variables are sampled at their own grid location; the # position map for the target field carries the staggering so the # field's stencil neighbors land on the right indices. - with suppress(AttributeError): - exprs = tuple(e._eval_at(f).evaluate - for e, f in zip(exprs, fields, strict=True)) + try: + _expr = expr._eval_at(field).evaluate + except AttributeError: + _expr = expr - eqns = [] - temps = [] - # We need to create one set of equations (temps and and coeffs) per staggering - # field in which we inject as the reference index depends on the field's origin - for _, g in groupby(zip(fields, exprs, strict=True), lambda f: f[0].staggered): - g_fields, g_exprs = zip(*g, strict=True) - variables = list(v for e in g_exprs for v in retrieve_function_carriers(e)) + subdomain = _extract_subdomain((field,)) - implicit_dims = self._augment_implicit_dims(implicit_dims, variables) + variables = list(retrieve_function_carriers(_expr)) - # All fields in a single injection share the same staggering by - # construction (they are written together at the same indices), so - # derive shifts from the first field. - shifts = self._field_shifts(g_fields[0]) + implicit_dims = self._augment_implicit_dims(implicit_dims, + (field,) + tuple(variables)) - # Move all temporaries inside inner loop to improve parallelism - # Can only be done for inject as interpolation needs a summing temp - # that wouldn't allow collapsing - implicit_dims = implicit_dims + tuple(r.parent for r in - self._rdim(subdomain=subdomain, - shifts=shifts)) + # The reference index depends on the field's origin (staggering). + shifts = self._field_shifts(field) - # List of indirection indices for all adjacent grid points - idx_subs, _temps = self._interp_idx(list(g_fields) + variables, - implicit_dims=implicit_dims, - subdomain=subdomain, shifts=shifts) + idx_subs = self._interp_idx((field,) + tuple(variables), + subdomain=subdomain, shifts=shifts) - w = self._weights(subdomain=subdomain, shifts=shifts) - temps.extend(_temps) - eqns.extend([Inc(f.xreplace(idx_subs), (w * e).xreplace(idx_subs), - implicit_dims=implicit_dims) - for f, e in zip(g_fields, g_exprs, strict=True)]) + weights = self._weights(subdomain=subdomain, shifts=shifts) + lhs = field.xreplace(idx_subs) + rhs = (weights * _expr).xreplace(idx_subs) - return filter_ordered(temps) + eqns + return Inc(lhs, rhs, implicit_dims=implicit_dims) class LinearInterpolator(WeightedInterpolator): @@ -543,13 +494,14 @@ def _point_symbols(self, shifts=None): return DimensionTuple(*symbols, getters=self.grid.dimensions) def _coeff_temps(self, implicit_dims, shifts=None): - # Positions - pmap = self.sfunction._position_map(shifts=shifts) + # The fractional part of the unrounded position; reuse the + # ``rpos*`` Symbols emitted by ``_positions`` rather than the full + # ``(c - o)/h`` expression so the divide is computed only once. + rposs = self._raw_pos_symbols(shifts=shifts) psyms = self._point_symbols(shifts) - poseq = [Eq(psyms[d], pos - floor(pos), - implicit_dims=implicit_dims) - for (d, pos) in zip(self._gdims, pmap.keys(), strict=True)] - return poseq + return [Eq(psyms[d], rposs[d] - floor(rposs[d]), + implicit_dims=implicit_dims) + for d in self._gdims] class PrecomputedInterpolator(WeightedInterpolator): @@ -580,7 +532,10 @@ def interpolation_coeffs(self): @memoized_meth def _weights(self, subdomain=None, shifts=None): ddim, cdim = self.interpolation_coeffs.dimensions[1:] - mappers = [{ddim: ri, cdim: rd-rd.parent.symbolic_min} + # The radius CustomDim spans ``[-r+1, r]``; the coefficients + # table is indexed from 0 to ``2r-1``, so shift by ``r-1``. + offset = self.r - 1 + mappers = [{ddim: ri, cdim: rd + offset} for (ri, rd) in enumerate(self._rdim(subdomain=subdomain, shifts=shifts))] return Mul(*[self.interpolation_coeffs.subs(mapper) @@ -617,12 +572,13 @@ def __init__(self, sfunction): def interpolation_coeffs(self): coeffs = [] shape = (self.sfunction.npoint, 2 * self.r) - for r in self._cdim: - dimensions = (self.sfunction._sparse_dim, r) - sf = SubFunction(name=f"wsinc{r.name}", dtype=self.sfunction.dtype, + for d in self._gdims: + rd = self.sfunction._crdim(d) + dimensions = (self.sfunction._sparse_dim, rd) + sf = SubFunction(name=f"wsinc{rd.name}", dtype=self.sfunction.dtype, shape=shape, dimensions=dimensions, space_order=0, alias=self.sfunction.alias, - parent=None) + parent=self.sfunction) coeffs.append(sf) return tuple(coeffs) diff --git a/devito/operator/operator.py b/devito/operator/operator.py index bfba76b593..c2b2e32ede 100644 --- a/devito/operator/operator.py +++ b/devito/operator/operator.py @@ -19,7 +19,7 @@ CompilationError, ExecutionError, InvalidArgument, InvalidOperator ) from devito.ir.clusters import ClusterGroup, clusterize -from devito.ir.equations import LoweredEq, concretize_subdims, lower_exprs +from devito.ir.equations import concretize_subdims, lower_eq, lower_exprs from devito.ir.iet import ( Callable, CInterface, DeviceFunction, EntryFunction, FindSymbols, MetaCall, derive_parameters, iet_build @@ -33,7 +33,7 @@ from devito.parameters import configuration from devito.passes import ( Graph, error_mapper, generate_implicit, generate_macros, is_on_device, lower_dtypes, - lower_index_derivatives, minimize_symbols, optimize_pows, unevaluate + lower_index_derivatives, lower_sparse_ops, minimize_symbols, optimize_pows, unevaluate ) from devito.symbolics import estimate_cost, subs_op_args from devito.tools import ( @@ -192,7 +192,10 @@ def _check_kwargs(cls, **kwargs): @classmethod def _sanitize_exprs(cls, expressions, **kwargs): - expressions = as_tuple(expressions) + # Flatten so callers can mix nested lists (e.g. ``free_surface`` + # returning a list, or multi-field injections returning one + # ``SparseEq`` per field) without having to manually unpack them. + expressions = tuple(flatten(as_tuple(expressions))) for i in expressions: if not isinstance(i, Evaluable): @@ -362,12 +365,9 @@ def _lower_exprs(cls, expressions, **kwargs): # "True" lowering (indexification, shifting, ...) expressions = lower_exprs(expressions, **kwargs) - - # Turn user-defined SubDimensions into concrete SubDimensions, - # in particular uniqueness across expressions is ensured expressions = concretize_subdims(expressions, **kwargs) - processed = [LoweredEq(i) for i in expressions] + processed = [lower_eq(i) for i in expressions] return processed @@ -490,8 +490,12 @@ def _lower_iet(cls, uiet, **kwargs): parameters = derive_parameters(uiet, True) iet = EntryFunction(name, uiet, 'int', parameters, ()) - # Lower IET to a target-specific IET + # Materialise the sparse-op iteration nests into ElementalFunctions + # before target specialisation, so the position/coefficient temps + # the IET pass emits are visible to downstream passes (e.g. the + # data-transfer placement on device). graph = Graph(iet, **kwargs) + lower_sparse_ops(graph, **kwargs) graph = cls._specialize_iet(graph, **kwargs) # Instrument the IET for C-level profiling diff --git a/devito/passes/clusters/buffering.py b/devito/passes/clusters/buffering.py index 2a8c78e7fc..541840602c 100644 --- a/devito/passes/clusters/buffering.py +++ b/devito/passes/clusters/buffering.py @@ -500,6 +500,9 @@ def key(c): raise CompilationError("Unsupported `buffering` over different " "IterationSpaces") + if not ispaces: + return IterationSpace(Interval(self.dim)) + return ispaces.pop() @cached_property diff --git a/devito/passes/iet/__init__.py b/devito/passes/iet/__init__.py index 1cdb97c794..56f2106795 100644 --- a/devito/passes/iet/__init__.py +++ b/devito/passes/iet/__init__.py @@ -1,5 +1,6 @@ from .engine import * # noqa from .misc import * # noqa +from .sparse import * # noqa from .orchestration import * # noqa from .mpi import * # noqa from .definitions import * # noqa diff --git a/devito/passes/iet/engine.py b/devito/passes/iet/engine.py index 9b936fba76..d4cb2d28e8 100644 --- a/devito/passes/iet/engine.py +++ b/devito/passes/iet/engine.py @@ -81,16 +81,29 @@ def __init__(self, iet, options=None, sregistry=None, **kwargs): writes = FindSymbols('writes').visit(iet) self.writes_input = frozenset(f for f in writes if f.is_Input) - # All symbols requiring host-device data transfers when running - # on device - self.data_movs = rmovs, wmovs = set(), set() - gpu_fit = (options or {}).get('gpu-fit', ()) - for i in FindNodes(ExprStmt).visit(iet): - wmovs.update({w for w in i.writes - if needs_transfer(w, gpu_fit)}) - for i in FindNodes(ExprStmt).visit(iet): - rmovs.update({f for f in i.functions - if needs_transfer(f, gpu_fit) and f not in wmovs}) + self._gpu_fit = (options or {}).get('gpu-fit', ()) + + @property + def data_movs(self): + """ + ``(reads, writes)`` of symbols requiring host-device data transfers. + + Recomputed on access from the current state of the Graph (root + plus every reachable efunc) so passes that introduce ExprStmts + after construction — e.g. ``lower_sparse_ops`` materialising the + sparse-op position temps inside an efunc — are reflected. + """ + rmovs, wmovs = set(), set() + for efunc in self.efuncs.values(): + for i in FindNodes(ExprStmt).visit(efunc): + wmovs.update(w for w in i.writes + if needs_transfer(w, self._gpu_fit)) + for efunc in self.efuncs.values(): + for i in FindNodes(ExprStmt).visit(efunc): + rmovs.update(f for f in i.functions + if needs_transfer(f, self._gpu_fit) + and f not in wmovs) + return rmovs, wmovs @property def root(self): diff --git a/devito/passes/iet/misc.py b/devito/passes/iet/misc.py index 0a631bf3a2..68fff9da95 100644 --- a/devito/passes/iet/misc.py +++ b/devito/passes/iet/misc.py @@ -46,7 +46,10 @@ def avoid_denormals(iet, platform=None, **kwargs): except AttributeError: return iet, {} - if iet.is_ElementalFunction: + # Denormals only need flushing once at the program entry; sibling + # Callables (e.g. ElementalFunctions emitted by lower_sparse_ops) + # inherit the SSE state from the caller + if not isinstance(iet, EntryFunction): return iet, {} header = (cgen.Comment('Flush denormal numbers to zero in hardware'), diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py index d5752fec52..d0d9a9afd1 100644 --- a/devito/passes/iet/parpragma.py +++ b/devito/passes/iet/parpragma.py @@ -130,6 +130,16 @@ def _make_simd(self, iet): if any(i.is_Indexed for i in reductions): continue + # A scalar reduction in the loop body (e.g. ``acc += ...`` with + # ``acc`` a Symbol, as emitted by the interpolation accumulator + # pattern) is a cross-iteration dependence the SIMD pragma alone + # can't express -- it would need a ``reduction(+:acc)`` clause, + # which we don't emit here. Without it, ``_Complex`` accumulators + # are miscompiled by some gcc releases. + exprs = FindNodes(Expression).visit(candidate) + if any(e.is_reduction and not e.output.is_Indexed for e in exprs): + continue + # Add SIMD pragma simd = self._make_simd_pragma(candidate) pragmas = candidate.pragmas + simd diff --git a/devito/passes/iet/sparse.py b/devito/passes/iet/sparse.py new file mode 100644 index 0000000000..5556240559 --- /dev/null +++ b/devito/passes/iet/sparse.py @@ -0,0 +1,256 @@ +""" +IET pass that lowers sparse-operation expressions (Interpolation/Injection) +into ElementalFunctions and replaces the iteration nest produced by the +cluster pipeline with a Call to the resulting efunc. + +By the time this pass runs, the cluster pipeline has scheduled each +sparse op into a regular ``(.., p_sf, rd_x, rd_y, ..)`` iteration nest +with a single inner ``Expression`` carrying the synthetic +``LoweredSparseEq``. The pass: + +* finds the outermost ``Iteration`` whose Dimension belongs to the + sparse op (the SparseFunction's sparse Dimension or a radius + Dimension); +* rewrites the body of the sparse Dimension's ``Iteration`` so the + position/coefficient temporaries (``posx``, ``px``, ...) are + computed once per sparse point, above the radius loops; +* for an interpolation, wraps the inner Expression with the scalar + accumulator pattern (``acc = 0``; ``acc += weights * rhs`` inside + the radius loops; ``sf[p_sf] = acc`` afterwards); +* for an injection, leaves the inner Expression as the existing + weighted ``Inc(field[pos+rd], weights * rhs)``; +* wraps the resulting sub-tree in an ``ElementalFunction`` and + replaces it in the parent IET with a ``Call``. +""" + +from collections import OrderedDict + +from devito.ir.equations import DummyEq +from devito.ir.equations.algorithms import lower_exprs +from devito.ir.iet import ( + Call, EntryFunction, Expression, FindNodes, HaloSpot, Increment, Iteration, List, + Transformer, make_callable +) +from devito.passes.iet.engine import iet_pass +from devito.types import Eq, InjectionMixin, InterpolationMixin, Symbol + +__all__ = ['lower_sparse_ops'] + + +@iet_pass +def lower_sparse_ops(iet, sregistry=None, **kwargs): + """ + Replace each sparse-op iteration nest in the IET with a Call to an + ElementalFunction that materialises the position temporaries and + the inner accumulator/increment pattern. + """ + if not isinstance(iet, EntryFunction): + return iet, {} + + # The "head" of a sparse op in the IET is the unique Expression + # whose lhs is the SparseFunction (interpolation) or the target + # field (injection); any auxiliary temporary expressions extracted + # from the original SparseEq (e.g. by ``factorize``/``cse``) are + # left where they are inside the radius nest. + sparse_exprs = [e for e in FindNodes(Expression).visit(iet) + if e.expr.is_SparseOperation + and type(e.expr).is_head_eq(e.expr, + e.expr.interpolator.sfunction)] + if not sparse_exprs: + return iet, {} + + # Group head Expressions by their enclosing outer (sparse-Dimension) + # Iteration; all such Expressions inside the same outer nest share + # one efunc. + groups = OrderedDict() + for expr in sparse_exprs: + nest = _find_outer_iteration(iet, expr) + if nest is None: + continue + groups.setdefault(nest, []).append(expr) + + # ``lower_sparse_ops`` runs before ``optimize_halospots``, so the + # halo-exchange optimiser hasn't yet had a chance to drop the + # reduction-only halo entries that the IR scheduler put around an + # injection nest (e.g. an entry for ``u`` at ``loc_indices={time: + # time+1}`` wrapping ``u[time+1] += ...``). Once the nest becomes a + # Call those expressions are no longer visible to + # ``_drop_reduction_halospots``, so we shed those entries here -- and + # if that empties the HaloSpot, replace it whole so the MPI overlap + # machinery doesn't wrap our Call with stale dynamic-args plumbing. + parents = {nest: _enclosing_halospot(iet, nest) for nest in groups} + + mapper = {} + efuncs = [] + for nest, exprs in groups.items(): + new_nest = _materialise_nest(nest, exprs) + + lse = exprs[0].expr + prefix = f'{lse.efunc_prefix}_{lse.interpolator.sfunction.name}' + efunc = make_callable(sregistry.make_name(prefix=prefix), new_nest) + efuncs.append(efunc) + + call = Call(efunc.name, list(efunc.parameters)) + parent = parents[nest] + if parent is None: + mapper[nest] = call + continue + + # Drop fields that the (now-opaque) Call only writes/increments, + # since the wrapping HaloSpot's purpose was to ensure read-side + # coherency for them and the read no longer exists at the IET + # level. Interpolation reads its target field, so its entries + # stay. + reduced = {e.expr.lhs.function for e in exprs + if isinstance(e.expr, InjectionMixin)} + hs = parent.halo_scheme.drop(reduced) if reduced else parent.halo_scheme + if hs.is_void: + mapper[parent] = call + elif hs is parent.halo_scheme: + mapper[nest] = call + else: + mapper[parent] = parent._rebuild(halo_scheme=hs, body=call) + + if not mapper: + return iet, {} + + return Transformer(mapper).visit(iet), {'efuncs': efuncs} + + +def _find_outer_iteration(iet, expr): + """ + Walk up the IET from ``expr`` and return the outermost Iteration + whose ``dim.root`` is the SparseFunction's sparse Dimension. + """ + sparse_dim = expr.expr.interpolator.sfunction._sparse_dim + for it in FindNodes(Iteration).visit(iet): + if it.dim.root is sparse_dim and expr in FindNodes(Expression).visit(it): + return it + return None + + +def _enclosing_halospot(iet, nest): + """ + Return the HaloSpot directly wrapping ``nest``, if any. + """ + for hs in FindNodes(HaloSpot).visit(iet): + if nest in FindNodes(Iteration).visit(hs): + return hs + return None + + +def _materialise_nest(nest, exprs): + """ + Rewrite the sparse Dimension's Iteration body to compute the + position/coefficient temps once per sparse point, then for any + interpolation Expression wrap it with the scalar accumulator + pattern. Multiple sparse-op Expressions sharing the same outer + Iteration are materialised in one pass and reuse the same temps. + """ + # Position + coefficient temporaries as IET Expressions. These are + # the same for every Expression in the group, so we emit them once. + # The sample's leaf class (Interpolation/Injection) drives whether + # the temps carry staggering shifts. + sample = exprs[0].expr + temp_exprs = tuple(Expression(DummyEq(e.lhs, e.rhs)) + for e in lower_exprs(sample.sparse_temps())) + + # The radius nest is what runs once per sparse point. For each + # interpolation Expression in the group, build its + # accumulator-wrapped copy of the radius nest. Injection Exprs + # share a single copy of the radius nest (their ``Inc`` already + # carries the right ``weights * rhs`` form). + inner = nest.nodes[0] if len(nest.nodes) == 1 else List(body=nest.nodes) + interp_exprs = [e for e in exprs if isinstance(e.expr, InterpolationMixin)] + inject_exprs = [e for e in exprs if isinstance(e.expr, InjectionMixin)] + + body = [] + for expr in interp_exprs: + siblings = [e for e in exprs if e is not expr] + body.append(_interp_inner_block(inner, expr, expr.expr.interpolator, siblings)) + if inject_exprs: + drop = {e: None for e in interp_exprs} + body.append(Transformer(drop, nested=True).visit(inner) if drop else inner) + + return nest._rebuild(nodes=temp_exprs + tuple(body)) + + +def _interp_inner_block(inner, expr, interp, siblings): + """ + Build the accumulator/radius/write-back triple for an interpolation: + + ``Eq(acc, 0)`` + radius_nest with ``Inc(acc, weights * rhs)`` in place of expr + ``Eq(sf[p], acc)`` # ``Inc`` when the SparseEq is a SparseInc + + ``siblings`` are sparse-op Expression nodes in the same radius nest + (e.g. a second interpolation or an injection sharing the outer + sparse Iteration) that must be removed from this copy of the nest + so the accumulator pattern wraps only ``expr``'s increment. + + Any Iteration between the outer sparse Dimension and the radius + Dimensions (e.g. an extra non-grid Dimension on the SparseFunction) + is preserved as an enclosing loop around the accumulator pattern, + so the write-back ``sf[..., d, p_*]`` sees a fresh accumulator per + iteration of ``d``. + """ + eq = expr.expr + sf_lhs = eq.lhs + rhs = eq.rhs + + acc = Symbol(name=f'sum{interp.sfunction.name}', dtype=sf_lhs.dtype) + + # ``rhs`` is already indexified; only the weights need lowering + # (e.g. coefficient Functions are indexed by the radius dim). + # Pull the concretized radius ConditionalDimensions from ``rhs`` + # so the weights inherit the same (renamed) ``cond`` Thicknesses + # and don't leak duplicate ``x_ltkn``/``x_rtkn`` parameters into + # the efunc signature. + rdims_concrete = {d.name: d for d in rhs.free_symbols + if getattr(d, 'is_Conditional', False)} + weights = interp._weights() + if rdims_concrete: + weights = weights.xreplace({ + rd: rdims_concrete[rd.name] + for rd in weights.free_symbols + if getattr(rd, 'is_Conditional', False) and rd.name in rdims_concrete + }) + weighted_rhs = lower_exprs(Eq(acc, weights)).rhs * rhs + + init = Expression(DummyEq(acc, 0)) + inc = Increment(DummyEq(acc, weighted_rhs)) + # Honour the synthetic Eq's flavour: a SparseInc means the user + # asked for ``sf[p_*] += sum`` (interpolation with ``increment=True``); + # otherwise the standard write is ``sf[p_*] = sum``. + write_back_cls = Increment if eq.is_Reduction else Expression + write_back = write_back_cls(DummyEq(sf_lhs, acc)) + + # Single substitution: drop siblings, replace ``expr`` with ``inc``. + mapper = {expr: inc} + mapper.update({s: None for s in siblings}) + + radius_root = _find_radius_root(inner, interp.sfunction) + if radius_root is None or radius_root is inner: + return List(body=(init, + Transformer(mapper, nested=True).visit(inner), + write_back)) + + # Wrap the accumulator pattern around just the radius sub-tree, + # leaving the enclosing non-radius Iterations in place. + wrapped = List(body=(init, Transformer(mapper, nested=True).visit(radius_root), + write_back)) + return Transformer({radius_root: wrapped}, nested=True).visit(inner) + + +def _find_radius_root(inner, sfunction): + """ + Return the outermost Iteration in ``inner`` over a radius + CustomDimension of ``sfunction``, or None if no such Iteration is + found. Radius Dimensions are named ``r`` + (see ``AbstractSparseFunction._crdim``). + """ + prefix = f'r{sfunction._sparse_dim.name}' + for it in FindNodes(Iteration).visit(inner): + if it.dim.name.startswith(prefix): + return it + return None diff --git a/devito/symbolics/extended_sympy.py b/devito/symbolics/extended_sympy.py index 4523fbd2f4..9ea3128753 100644 --- a/devito/symbolics/extended_sympy.py +++ b/devito/symbolics/extended_sympy.py @@ -380,6 +380,8 @@ class UnaryOp(Expr, Pickable, BasicWrapperMixin): _op = '' + is_commutative = True + __rargs__ = ('base',) def __new__(cls, base, **kwargs): diff --git a/devito/symbolics/inspection.py b/devito/symbolics/inspection.py index e19cd89ad1..024791a55b 100644 --- a/devito/symbolics/inspection.py +++ b/devito/symbolics/inspection.py @@ -321,6 +321,14 @@ def sympy_dtype(expr, base=None, default=None, smin=None): with suppress(AttributeError): dtypes.add(i.dtype) + # A ``Cast`` overrides the dtype of the symbol(s) it wraps -- e.g. + # ``DOUBLE(o_x)`` is observably double-typed even though ``o_x`` + # itself is float-typed. Without this, the C printer would pick + # float-precision literals (``1.0F``) when emitting a ``Pow(DOUBLE(h), + # -1)`` inside a wider expression. + for c in expr.atoms(Cast): + dtypes.add(c.dtype) + if not dtypes or not np.issubdtype(base, np.complexfloating): dtypes.update({base} - {None}) diff --git a/devito/types/dense.py b/devito/types/dense.py index a1dc95d75c..cc65d2c194 100644 --- a/devito/types/dense.py +++ b/devito/types/dense.py @@ -1633,8 +1633,11 @@ def _arg_values(self, **kwargs): return self._arg_defaults(alias=self) def _arg_apply(self, *args, **kwargs): + # Parent-owned SubFunction data is computed once and read by the + # Operator; the parent gathers its own data via its own parameter + # entry, so there's nothing for the SubFunction to do here. if self._parent is not None: - return self._parent._arg_apply(*args, **kwargs) + return return super()._arg_apply(*args, **kwargs) @property diff --git a/devito/types/equation.py b/devito/types/equation.py index 5cbba42e89..c0723f2509 100644 --- a/devito/types/equation.py +++ b/devito/types/equation.py @@ -4,10 +4,12 @@ import sympy from devito.deprecations import deprecations -from devito.tools import Pickable, as_tuple, frozendict +from devito.tools import Pickable, as_tuple, flatten, frozendict from devito.types.lazy import Evaluable -__all__ = ['Eq', 'Inc', 'ReduceMax', 'ReduceMin', 'ReduceMinMax'] +__all__ = ['Eq', 'Inc', 'IncrInterpolation', 'Injection', 'InjectionMixin', + 'Interpolation', 'InterpolationMixin', 'ReduceMax', 'ReduceMin', + 'ReduceMinMax', 'SparseEq', 'SparseInc', 'SparseOpMixin'] class Eq(sympy.Eq, Evaluable, Pickable): @@ -58,6 +60,7 @@ class Eq(sympy.Eq, Evaluable, Pickable): """ is_Reduction = False + is_SparseOperation = False __rargs__ = ('lhs', 'rhs') __rkwargs__ = ('subdomain', 'coefficients', 'implicit_dims') @@ -257,3 +260,169 @@ def __new__(cls, lhs, rhs=0, **kwargs): ) return super().__new__(cls, lhs, rhs=rhs, **kwargs) + + +class SparseOpMixin: + + """ + Tagging mixin shared by every sparse-grid operation Eq (whether at + the user, IR, or cluster layer). Carries the ``interpolator`` + metadata needed by the IET pass ``lower_sparse_ops`` to materialise + the radius nest into an ElementalFunction; per-operation behaviour + (interpolation vs. injection) lives on the leaf classes + ``Interpolation`` / ``IncrInterpolation`` / ``Injection`` via + ``InterpolationMixin`` / ``InjectionMixin``. + """ + + is_SparseOperation = True + + @property + def interpolator(self): + return self._interpolator + + +class InterpolationMixin: + + """ + Polymorphic behaviour shared by all sparse-op Eqs representing an + interpolation ``sf[..., p_*] <- expr[..., rp_*]`` (the default + ``Eq`` flavour as ``Interpolation`` and the ``+=`` flavour as + ``IncrInterpolation``). + """ + + efunc_prefix = 'interpolate' + + @property + def field(self): + # An interpolation writes into the SparseFunction, not into a + # grid field, so there is no target field to drive staggering. + return None + + @classmethod + def is_head_eq(cls, eq, sfunction): + # The "head" of an interpolation in the IET is the unique + # Expression whose lhs is the SparseFunction. + return eq.lhs.function is sfunction + + def sparse_temps(self): + """ + Position/coefficient temps to be emitted alongside the radius + nest by the IET pass. + """ + return self.interpolator.sparse_temps(self.rhs, self.implicit_dims) + + +class InjectionMixin: + + """ + Polymorphic behaviour shared by all sparse-op Eqs representing an + injection ``Inc(field[..., pos+rd, ...], weights * rhs)``. + """ + + efunc_prefix = 'inject' + + @property + def field(self): + # An injection writes into a grid field — exposed as ``lhs.function``. + return self.lhs.function + + @classmethod + def is_head_eq(cls, eq, sfunction): + # The "head" of an injection in the IET is the unique + # Expression whose lhs is a (non-sparse) DiscreteFunction. + f = eq.lhs.function + return f.is_DiscreteFunction and f is not sfunction + + def sparse_temps(self): + """ + Position/coefficient temps to be emitted alongside the radius + nest by the IET pass. The target field drives the per-Dimension + shifts so the temps' lhs (``pos*`` symbols) match the rhs of a + staggered injection. + """ + return self.interpolator.sparse_temps(self.rhs, self.implicit_dims, + field=self.field) + + +class SparseEq(SparseOpMixin, Eq): + + """ + An Eq representing a sparse-grid operation. ``SparseEq`` is the + abstract base; instantiate ``Interpolation`` (``sf <- expr``), + ``IncrInterpolation`` (``sf += expr``), or ``Injection`` + (``Inc(field, weights * expr)``) instead. + + The single synthetic Eq uses the SparseFunction's radius + ConditionalDimensions ``rp_*`` (whose parent is the grid Dimension) + as indices into the user expression, so the cluster scheduler + derives a regular IterationSpace including the radius loops. The + cluster pipeline (buffering, halo, snapshotting, ...) treats it + like any other Eq; the IET pass ``lower_sparse_ops`` extracts the + resulting ``p_*, rp_*`` nest and wraps it in an ElementalFunction, + inserting the position/weight temps that drive the radius access. + """ + + __rkwargs__ = Eq.__rkwargs__ + ('interpolator',) + + def __new__(cls, lhs, rhs=0, *, interpolator=None, + implicit_dims=None, **kwargs): + obj = super().__new__(cls, lhs, rhs=rhs, implicit_dims=implicit_dims, + **kwargs) + obj._interpolator = interpolator + return obj + + def __add__(self, other): + # Allow ``list_of_eqs + sparse_eq`` and ``sparse_eq + sparse_eq`` + # to produce a flat list — handy when composing user expression + # bundles passed to ``Operator(...)``. + return flatten([self, other]) + + def __radd__(self, other): + return flatten([other, self]) + + def __repr__(self): + sf = self._interpolator.sfunction + return f"{type(self).__name__}({sf.name})" + + __str__ = __repr__ + + +class SparseInc(SparseEq, Inc): + + """ + The ``Inc`` flavour of ``SparseEq``. The ``+=`` semantics is + required when contributions from multiple sparse points accumulate + into the same target cell (injection) or when the user asks for + ``sf += interp(expr)``. + """ + + pass + + +class Interpolation(InterpolationMixin, SparseEq): + + """ + A standard interpolation ``Eq(sf[..., p_*], expr[..., rp_*])``. + """ + + pass + + +class IncrInterpolation(InterpolationMixin, SparseInc): + + """ + An interpolation with ``+=`` semantics: ``Inc(sf[..., p_*], + expr[..., rp_*])``. Produced by ``interpolate(..., increment=True)``. + """ + + pass + + +class Injection(InjectionMixin, SparseInc): + + """ + An injection ``Inc(field[..., pos+rd, ...], weights(rd_*) * + expr[..., rd_*])``. + """ + + pass diff --git a/devito/types/relational.py b/devito/types/relational.py index 731ec29bc7..10b4b861fd 100644 --- a/devito/types/relational.py +++ b/devito/types/relational.py @@ -37,7 +37,7 @@ class Le(AbstractRel, sympy.Le): rhs : expr-like, optional The right-hand side. Defaults to 0. subdomain : SubDomain, optional - To restrict the evalaution of the relation to a particular sub-region in the + To restrict the evaluation of the relation to a particular sub-region in the computational domain. Examples @@ -72,7 +72,7 @@ class Lt(AbstractRel, sympy.Lt): rhs : expr-like, optional The right-hand side. Defaults to 0. subdomain : SubDomain, optional - To restrict the evalaution of the relation to a particular sub-region in the + To restrict the evaluation of the relation to a particular sub-region in the computational domain. Examples @@ -107,7 +107,7 @@ class Ge(AbstractRel, sympy.Ge): rhs : expr-like, optional The right-hand side. Defaults to 0. subdomain : SubDomain, optional - To restrict the evalaution of the relation to a particular sub-region in the + To restrict the evaluation of the relation to a particular sub-region in the computational domain. Examples @@ -142,7 +142,7 @@ class Gt(AbstractRel, sympy.Gt): rhs : expr-like, optional The right-hand side. Defaults to 0. subdomain : SubDomain, optional - To restrict the evalaution of the relation to a particular sub-region in the + To restrict the evaluation of the relation to a particular sub-region in the computational domain. Examples @@ -177,7 +177,7 @@ class Ne(AbstractRel, sympy.Ne): rhs : expr-like, optional The right-hand side. Defaults to 0. subdomain : SubDomain, optional - To restrict the evalaution of the relation to a particular sub-region in the + To restrict the evaluation of the relation to a particular sub-region in the computational domain. Examples diff --git a/devito/types/sparse.py b/devito/types/sparse.py index e7b528b370..502a7c75d4 100644 --- a/devito/types/sparse.py +++ b/devito/types/sparse.py @@ -375,7 +375,7 @@ def _pos_symbols(self, shifts=None): symbols.append(Symbol(name=f'pos{d}_s1', dtype=np.int32)) else: symbols.append(Symbol(name=f'pos{d}', dtype=np.int32)) - return symbols + return DimensionTuple(*symbols, getters=self.grid.dimensions) @cached_property def _point_increments(self): diff --git a/examples/mpi/overview.ipynb b/examples/mpi/overview.ipynb index b46f49aaf4..99f48c8992 100644 --- a/examples/mpi/overview.ipynb +++ b/examples/mpi/overview.ipynb @@ -43,7 +43,14 @@ { "cell_type": "code", "execution_count": 1, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-02T03:41:49.297503Z", + "iopub.status.busy": "2026-06-02T03:41:49.296909Z", + "iopub.status.idle": "2026-06-02T03:41:49.431157Z", + "shell.execute_reply": "2026-06-02T03:41:49.429709Z" + } + }, "outputs": [], "source": [ "import ipyparallel as ipp\n", @@ -60,7 +67,14 @@ { "cell_type": "code", "execution_count": 2, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-02T03:41:49.436124Z", + "iopub.status.busy": "2026-06-02T03:41:49.435765Z", + "iopub.status.idle": "2026-06-02T03:41:49.469987Z", + "shell.execute_reply": "2026-06-02T03:41:49.468314Z" + } + }, "outputs": [ { "name": "stdout", @@ -102,7 +116,14 @@ { "cell_type": "code", "execution_count": 3, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-02T03:41:49.523096Z", + "iopub.status.busy": "2026-06-02T03:41:49.522469Z", + "iopub.status.idle": "2026-06-02T03:41:50.443944Z", + "shell.execute_reply": "2026-06-02T03:41:50.443000Z" + } + }, "outputs": [], "source": [ "%%px\n", @@ -116,7 +137,14 @@ { "cell_type": "code", "execution_count": 4, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-02T03:41:50.449058Z", + "iopub.status.busy": "2026-06-02T03:41:50.448497Z", + "iopub.status.idle": "2026-06-02T03:41:50.477536Z", + "shell.execute_reply": "2026-06-02T03:41:50.475960Z" + } + }, "outputs": [], "source": [ "%%px\n", @@ -139,7 +167,14 @@ { "cell_type": "code", "execution_count": 5, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-02T03:41:50.482877Z", + "iopub.status.busy": "2026-06-02T03:41:50.482293Z", + "iopub.status.idle": "2026-06-02T03:41:50.525021Z", + "shell.execute_reply": "2026-06-02T03:41:50.523488Z" + } + }, "outputs": [], "source": [ "%%px\n", @@ -158,7 +193,14 @@ { "cell_type": "code", "execution_count": 6, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-02T03:41:50.530808Z", + "iopub.status.busy": "2026-06-02T03:41:50.530184Z", + "iopub.status.idle": "2026-06-02T03:41:50.579266Z", + "shell.execute_reply": "2026-06-02T03:41:50.577823Z" + } + }, "outputs": [ { "name": "stdout", @@ -194,7 +236,14 @@ { "cell_type": "code", "execution_count": 7, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-02T03:41:50.584171Z", + "iopub.status.busy": "2026-06-02T03:41:50.583721Z", + "iopub.status.idle": "2026-06-02T03:41:50.611914Z", + "shell.execute_reply": "2026-06-02T03:41:50.610298Z" + } + }, "outputs": [], "source": [ "%%px\n", @@ -204,7 +253,14 @@ { "cell_type": "code", "execution_count": 8, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-02T03:41:50.617117Z", + "iopub.status.busy": "2026-06-02T03:41:50.616516Z", + "iopub.status.idle": "2026-06-02T03:41:50.650665Z", + "shell.execute_reply": "2026-06-02T03:41:50.649049Z" + } + }, "outputs": [ { "name": "stdout", @@ -242,7 +298,14 @@ { "cell_type": "code", "execution_count": 9, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-02T03:41:50.656905Z", + "iopub.status.busy": "2026-06-02T03:41:50.656300Z", + "iopub.status.idle": "2026-06-02T03:41:50.902948Z", + "shell.execute_reply": "2026-06-02T03:41:50.902127Z" + } + }, "outputs": [ { "name": "stderr", @@ -271,7 +334,14 @@ { "cell_type": "code", "execution_count": 10, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-02T03:41:50.907199Z", + "iopub.status.busy": "2026-06-02T03:41:50.907026Z", + "iopub.status.idle": "2026-06-02T03:41:50.932807Z", + "shell.execute_reply": "2026-06-02T03:41:50.931899Z" + } + }, "outputs": [ { "name": "stdout", @@ -308,6 +378,12 @@ "cell_type": "code", "execution_count": 11, "metadata": { + "execution": { + "iopub.execute_input": "2026-06-02T03:41:50.936857Z", + "iopub.status.busy": "2026-06-02T03:41:50.936619Z", + "iopub.status.idle": "2026-06-02T03:41:50.952864Z", + "shell.execute_reply": "2026-06-02T03:41:50.951507Z" + }, "scrolled": true }, "outputs": [ @@ -392,7 +468,14 @@ { "cell_type": "code", "execution_count": 12, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-02T03:41:50.957600Z", + "iopub.status.busy": "2026-06-02T03:41:50.957171Z", + "iopub.status.idle": "2026-06-02T03:41:52.287455Z", + "shell.execute_reply": "2026-06-02T03:41:52.286319Z" + } + }, "outputs": [], "source": [ "%%px\n", @@ -403,6 +486,12 @@ "cell_type": "code", "execution_count": 13, "metadata": { + "execution": { + "iopub.execute_input": "2026-06-02T03:41:52.291426Z", + "iopub.status.busy": "2026-06-02T03:41:52.291239Z", + "iopub.status.idle": "2026-06-02T03:41:52.337913Z", + "shell.execute_reply": "2026-06-02T03:41:52.337101Z" + }, "scrolled": true }, "outputs": [ @@ -486,9 +575,9 @@ " MPI_Request rsend;\n", "\n", " float *restrict bufg_vec __attribute__ ((aligned (64)));\n", - " posix_memalign((void**)(&bufg_vec),64,sizeof(float)*(long)y_size*(long)x_size);\n", + " posix_memalign((void**)(&bufg_vec),64,sizeof(float)*(long)x_size*(long)y_size);\n", " float *restrict bufs_vec __attribute__ ((aligned (64)));\n", - " posix_memalign((void**)(&bufs_vec),64,sizeof(float)*(long)y_size*(long)x_size);\n", + " posix_memalign((void**)(&bufs_vec),64,sizeof(float)*(long)x_size*(long)y_size);\n", "\n", " MPI_Irecv(bufs_vec,x_size*y_size,MPI_FLOAT,fromrank,13,comm,&rrecv);\n", " if (torank != MPI_PROC_NULL)\n", @@ -587,7 +676,14 @@ { "cell_type": "code", "execution_count": 14, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-02T03:41:52.342064Z", + "iopub.status.busy": "2026-06-02T03:41:52.341878Z", + "iopub.status.idle": "2026-06-02T03:41:52.372338Z", + "shell.execute_reply": "2026-06-02T03:41:52.370957Z" + } + }, "outputs": [ { "name": "stdout", @@ -635,7 +731,14 @@ { "cell_type": "code", "execution_count": 15, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-02T03:41:52.377304Z", + "iopub.status.busy": "2026-06-02T03:41:52.376753Z", + "iopub.status.idle": "2026-06-02T03:41:52.412522Z", + "shell.execute_reply": "2026-06-02T03:41:52.410972Z" + } + }, "outputs": [], "source": [ "%%px\n", @@ -645,7 +748,14 @@ { "cell_type": "code", "execution_count": 16, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-02T03:41:52.417696Z", + "iopub.status.busy": "2026-06-02T03:41:52.417117Z", + "iopub.status.idle": "2026-06-02T03:41:52.456949Z", + "shell.execute_reply": "2026-06-02T03:41:52.455345Z" + } + }, "outputs": [ { "name": "stdout", @@ -714,7 +824,14 @@ { "cell_type": "code", "execution_count": 17, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-02T03:41:52.462118Z", + "iopub.status.busy": "2026-06-02T03:41:52.461590Z", + "iopub.status.idle": "2026-06-02T03:41:52.537246Z", + "shell.execute_reply": "2026-06-02T03:41:52.535705Z" + } + }, "outputs": [], "source": [ "%%px\n", @@ -783,7 +900,14 @@ { "cell_type": "code", "execution_count": 18, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-02T03:41:52.542566Z", + "iopub.status.busy": "2026-06-02T03:41:52.542019Z", + "iopub.status.idle": "2026-06-02T03:41:53.701261Z", + "shell.execute_reply": "2026-06-02T03:41:53.699673Z" + } + }, "outputs": [ { "name": "stderr", @@ -806,7 +930,14 @@ { "cell_type": "code", "execution_count": 19, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-02T03:41:53.706501Z", + "iopub.status.busy": "2026-06-02T03:41:53.705917Z", + "iopub.status.idle": "2026-06-02T03:41:53.741608Z", + "shell.execute_reply": "2026-06-02T03:41:53.740235Z" + } + }, "outputs": [ { "name": "stdout", @@ -872,7 +1003,14 @@ { "cell_type": "code", "execution_count": 20, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-02T03:41:53.746907Z", + "iopub.status.busy": "2026-06-02T03:41:53.746360Z", + "iopub.status.idle": "2026-06-02T03:41:55.067972Z", + "shell.execute_reply": "2026-06-02T03:41:55.066355Z" + } + }, "outputs": [], "source": [ "%%px\n", @@ -907,6 +1045,12 @@ "cell_type": "code", "execution_count": 21, "metadata": { + "execution": { + "iopub.execute_input": "2026-06-02T03:41:55.073415Z", + "iopub.status.busy": "2026-06-02T03:41:55.072866Z", + "iopub.status.idle": "2026-06-02T03:41:56.689552Z", + "shell.execute_reply": "2026-06-02T03:41:56.687949Z" + }, "scrolled": true }, "outputs": [], @@ -957,7 +1101,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.5" + "version": "3.11.0rc1" } }, "nbformat": 4, diff --git a/examples/performance/00_overview.ipynb b/examples/performance/00_overview.ipynb index 72f1836aa4..694d47f2fa 100644 --- a/examples/performance/00_overview.ipynb +++ b/examples/performance/00_overview.ipynb @@ -121,7 +121,14 @@ { "cell_type": "code", "execution_count": 1, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-01T16:28:42.567152Z", + "iopub.status.busy": "2026-06-01T16:28:42.566586Z", + "iopub.status.idle": "2026-06-01T16:28:42.583202Z", + "shell.execute_reply": "2026-06-01T16:28:42.581749Z" + } + }, "outputs": [], "source": [ "from examples.performance import unidiff_output, print_kernel" @@ -137,7 +144,14 @@ { "cell_type": "code", "execution_count": 2, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-01T16:28:42.588731Z", + "iopub.status.busy": "2026-06-01T16:28:42.588131Z", + "iopub.status.idle": "2026-06-01T16:28:43.307056Z", + "shell.execute_reply": "2026-06-01T16:28:43.306474Z" + } + }, "outputs": [], "source": [ "from devito import configuration\n", @@ -158,7 +172,14 @@ { "cell_type": "code", "execution_count": 3, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-01T16:28:43.311646Z", + "iopub.status.busy": "2026-06-01T16:28:43.311378Z", + "iopub.status.idle": "2026-06-01T16:28:43.355233Z", + "shell.execute_reply": "2026-06-01T16:28:43.353765Z" + } + }, "outputs": [], "source": [ "from devito import Eq, Grid, Operator, Function, TimeFunction, sin\n", @@ -190,7 +211,14 @@ { "cell_type": "code", "execution_count": 4, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-01T16:28:43.359825Z", + "iopub.status.busy": "2026-06-01T16:28:43.359631Z", + "iopub.status.idle": "2026-06-01T16:28:43.695843Z", + "shell.execute_reply": "2026-06-01T16:28:43.694268Z" + } + }, "outputs": [ { "name": "stdout", @@ -265,7 +293,14 @@ { "cell_type": "code", "execution_count": 5, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-01T16:28:43.751007Z", + "iopub.status.busy": "2026-06-01T16:28:43.750383Z", + "iopub.status.idle": "2026-06-01T16:28:43.925853Z", + "shell.execute_reply": "2026-06-01T16:28:43.924714Z" + } + }, "outputs": [ { "name": "stdout", @@ -319,7 +354,14 @@ { "cell_type": "code", "execution_count": 6, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-01T16:28:43.930084Z", + "iopub.status.busy": "2026-06-01T16:28:43.929898Z", + "iopub.status.idle": "2026-06-01T16:28:44.138584Z", + "shell.execute_reply": "2026-06-01T16:28:44.137102Z" + } + }, "outputs": [ { "name": "stdout", @@ -401,7 +443,14 @@ { "cell_type": "code", "execution_count": 7, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-01T16:28:44.143799Z", + "iopub.status.busy": "2026-06-01T16:28:44.143564Z", + "iopub.status.idle": "2026-06-01T16:28:44.398172Z", + "shell.execute_reply": "2026-06-01T16:28:44.396530Z" + } + }, "outputs": [ { "name": "stdout", @@ -465,7 +514,14 @@ { "cell_type": "code", "execution_count": 8, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-01T16:28:44.402710Z", + "iopub.status.busy": "2026-06-01T16:28:44.402508Z", + "iopub.status.idle": "2026-06-01T16:28:44.646643Z", + "shell.execute_reply": "2026-06-01T16:28:44.645084Z" + } + }, "outputs": [], "source": [ "op2_omp = Operator(eq, opt=('blocking', 'simd', {'openmp': True}))\n", @@ -485,7 +541,14 @@ { "cell_type": "code", "execution_count": 9, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-01T16:28:44.652114Z", + "iopub.status.busy": "2026-06-01T16:28:44.651515Z", + "iopub.status.idle": "2026-06-01T16:28:44.888936Z", + "shell.execute_reply": "2026-06-01T16:28:44.887867Z" + } + }, "outputs": [ { "name": "stdout", @@ -555,7 +618,14 @@ { "cell_type": "code", "execution_count": 10, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-01T16:28:44.894086Z", + "iopub.status.busy": "2026-06-01T16:28:44.893498Z", + "iopub.status.idle": "2026-06-01T16:28:45.108474Z", + "shell.execute_reply": "2026-06-01T16:28:45.107645Z" + } + }, "outputs": [ { "name": "stdout", @@ -572,13 +642,7 @@ "+ u[t1][x + 4][y + 4][z + 4] = (f[x + 1][y + 1][z + 1]*f[x + 1][y + 1][z + 1])*((-6.66666667e-1F*r0)*(8.33333333e-2F*r0*u[t0][x + 4][y + 1][z + 4] - 6.66666667e-1F*r0*u[t0][x + 4][y + 2][z + 4] + 6.66666667e-1F*r0*u[t0][x + 4][y + 4][z + 4] - 8.33333333e-2F*r0*u[t0][x + 4][y + 5][z + 4]) + (-8.33333333e-2F*r0)*(8.33333333e-2F*r0*u[t0][x + 4][y + 4][z + 4] - 6.66666667e-1F*r0*u[t0][x + 4][y + 5][z + 4] + 6.66666667e-1F*r0*u[t0][x + 4][y + 7][z + 4] - 8.33333333e-2F*r0*u[t0][x + 4][y + 8][z + 4]) + (8.33333333e-2F*r0)*(8.33333333e-2F*r0*u[t0][x + 4][y][z + 4] - 6.66666667e-1F*r0*u[t0][x + 4][y + 1][z + 4] + 6.66666667e-1F*r0*u[t0][x + 4][y + 3][z + 4] - 8.33333333e-2F*r0*u[t0][x + 4][y + 4][z + 4]) + (6.66666667e-1F*r0)*(8.33333333e-2F*r0*u[t0][x + 4][y + 3][z + 4] - 6.66666667e-1F*r0*u[t0][x + 4][y + 4][z + 4] + 6.66666667e-1F*r0*u[t0][x + 4][y + 6][z + 4] - 8.33333333e-2F*r0*u[t0][x + 4][y + 7][z + 4]))*sinf(f[x + 1][y + 1][z + 1]);\n", " }\n", " }\n", - " }\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ + " }\n", "\n" ] } @@ -598,7 +662,14 @@ { "cell_type": "code", "execution_count": 11, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-01T16:28:45.112365Z", + "iopub.status.busy": "2026-06-01T16:28:45.112203Z", + "iopub.status.idle": "2026-06-01T16:28:45.284081Z", + "shell.execute_reply": "2026-06-01T16:28:45.283032Z" + } + }, "outputs": [ { "name": "stdout", @@ -659,7 +730,14 @@ { "cell_type": "code", "execution_count": 12, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-01T16:28:45.288573Z", + "iopub.status.busy": "2026-06-01T16:28:45.288172Z", + "iopub.status.idle": "2026-06-01T16:28:45.499479Z", + "shell.execute_reply": "2026-06-01T16:28:45.498011Z" + } + }, "outputs": [ { "name": "stdout", @@ -719,7 +797,14 @@ { "cell_type": "code", "execution_count": 13, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-01T16:28:45.504084Z", + "iopub.status.busy": "2026-06-01T16:28:45.503909Z", + "iopub.status.idle": "2026-06-01T16:28:45.805606Z", + "shell.execute_reply": "2026-06-01T16:28:45.804773Z" + } + }, "outputs": [ { "name": "stdout", @@ -759,14 +844,7 @@ " }\n", " }\n", " STOP(section0,timers)\n", - "}" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" + "}\n" ] } ], @@ -786,7 +864,14 @@ { "cell_type": "code", "execution_count": 14, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-01T16:28:45.809527Z", + "iopub.status.busy": "2026-06-01T16:28:45.809336Z", + "iopub.status.idle": "2026-06-01T16:28:46.107996Z", + "shell.execute_reply": "2026-06-01T16:28:46.106507Z" + } + }, "outputs": [ { "name": "stdout", @@ -877,7 +962,14 @@ { "cell_type": "code", "execution_count": 15, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-01T16:28:46.112120Z", + "iopub.status.busy": "2026-06-01T16:28:46.111955Z", + "iopub.status.idle": "2026-06-01T16:28:46.284747Z", + "shell.execute_reply": "2026-06-01T16:28:46.283902Z" + } + }, "outputs": [ { "name": "stdout", @@ -933,7 +1025,14 @@ { "cell_type": "code", "execution_count": 16, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-01T16:28:46.288880Z", + "iopub.status.busy": "2026-06-01T16:28:46.288714Z", + "iopub.status.idle": "2026-06-01T16:28:46.479456Z", + "shell.execute_reply": "2026-06-01T16:28:46.478520Z" + } + }, "outputs": [ { "name": "stdout", @@ -990,7 +1089,14 @@ { "cell_type": "code", "execution_count": 17, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-01T16:28:46.484129Z", + "iopub.status.busy": "2026-06-01T16:28:46.483963Z", + "iopub.status.idle": "2026-06-01T16:28:46.686649Z", + "shell.execute_reply": "2026-06-01T16:28:46.685585Z" + } + }, "outputs": [], "source": [ "eq = Eq(u.forward, f**2*sin(f)*u.dy.dy) # Back to original running example\n", @@ -1008,7 +1114,14 @@ { "cell_type": "code", "execution_count": 18, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-01T16:28:46.691894Z", + "iopub.status.busy": "2026-06-01T16:28:46.691695Z", + "iopub.status.idle": "2026-06-01T16:28:46.866515Z", + "shell.execute_reply": "2026-06-01T16:28:46.865568Z" + } + }, "outputs": [ { "name": "stdout", @@ -1058,7 +1171,14 @@ { "cell_type": "code", "execution_count": 19, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-01T16:28:46.870368Z", + "iopub.status.busy": "2026-06-01T16:28:46.870186Z", + "iopub.status.idle": "2026-06-01T16:28:47.132205Z", + "shell.execute_reply": "2026-06-01T16:28:47.131202Z" + } + }, "outputs": [ { "name": "stdout", @@ -1126,7 +1246,14 @@ { "cell_type": "code", "execution_count": 20, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-01T16:28:47.136205Z", + "iopub.status.busy": "2026-06-01T16:28:47.136020Z", + "iopub.status.idle": "2026-06-01T16:28:47.486946Z", + "shell.execute_reply": "2026-06-01T16:28:47.485867Z" + } + }, "outputs": [ { "name": "stdout", @@ -1171,11 +1298,11 @@ " float **restrict pr2_vec __attribute__ ((aligned (64)));\n", " posix_memalign((void**)(&pr2_vec),64,sizeof(float*)*(long)nthreads);\n", " float *restrict r0_vec __attribute__ ((aligned (64)));\n", - " posix_memalign((void**)(&r0_vec),64,sizeof(float)*(long)z_size*(long)y_size*(long)x_size);\n", + " posix_memalign((void**)(&r0_vec),64,sizeof(float)*(long)x_size*(long)y_size*(long)z_size);\n", " #pragma omp parallel num_threads(nthreads)\n", " {\n", " const int tid = omp_get_thread_num();\n", - " posix_memalign((void**)(&pr2_vec[tid]),64,sizeof(float)*(long)z_size*(4 + (long)y0_blk0_size));\n", + " posix_memalign((void**)(&pr2_vec[tid]),64,((long)y0_blk0_size + 4)*sizeof(float)*(long)z_size);\n", " }\n", "\n", " float (*restrict f)[f_vec->size[1]][f_vec->size[2]] __attribute__ ((aligned (64))) = (float (*)[f_vec->size[1]][f_vec->size[2]]) f_vec->data;\n", @@ -1293,7 +1420,14 @@ { "cell_type": "code", "execution_count": 21, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-01T16:28:47.490968Z", + "iopub.status.busy": "2026-06-01T16:28:47.490785Z", + "iopub.status.idle": "2026-06-01T16:28:47.836666Z", + "shell.execute_reply": "2026-06-01T16:28:47.835773Z" + } + }, "outputs": [ { "name": "stdout", @@ -1383,13 +1517,20 @@ { "cell_type": "code", "execution_count": 22, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-01T16:28:47.840639Z", + "iopub.status.busy": "2026-06-01T16:28:47.840446Z", + "iopub.status.idle": "2026-06-01T16:28:47.846099Z", + "shell.execute_reply": "2026-06-01T16:28:47.845313Z" + } + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "posix_memalign((void**)(&r0_vec),64,sizeof(float)*(long)z_size*(long)y_size*(long)x_size);\n" + "posix_memalign((void**)(&r0_vec),64,sizeof(float)*(long)x_size*(long)y_size*(long)z_size);\n" ] } ], @@ -1418,7 +1559,14 @@ { "cell_type": "code", "execution_count": 23, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-01T16:28:47.850257Z", + "iopub.status.busy": "2026-06-01T16:28:47.850030Z", + "iopub.status.idle": "2026-06-01T16:28:48.148180Z", + "shell.execute_reply": "2026-06-01T16:28:48.147362Z" + } + }, "outputs": [ { "name": "stdout", @@ -1462,11 +1610,11 @@ " float **restrict pr2_vec __attribute__ ((aligned (64)));\n", " posix_memalign((void**)(&pr2_vec),64,sizeof(float*)*(long)nthreads);\n", " float *restrict r0_vec __attribute__ ((aligned (64)));\n", - " posix_memalign((void**)(&r0_vec),64,sizeof(float)*(long)z_size*(long)y_size*(long)x_size);\n", + " posix_memalign((void**)(&r0_vec),64,sizeof(float)*(long)x_size*(long)y_size*(long)z_size);\n", " #pragma omp parallel num_threads(nthreads)\n", " {\n", " const int tid = omp_get_thread_num();\n", - " posix_memalign((void**)(&pr2_vec[tid]),64,sizeof(float)*(long)z_size*(4 + (long)y_size));\n", + " posix_memalign((void**)(&pr2_vec[tid]),64,((long)y_size + 4)*sizeof(float)*(long)z_size);\n", " }\n", "\n", " float (*restrict f)[f_vec->size[1]][f_vec->size[2]] __attribute__ ((aligned (64))) = (float (*)[f_vec->size[1]][f_vec->size[2]]) f_vec->data;\n", @@ -1560,7 +1708,14 @@ { "cell_type": "code", "execution_count": 24, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-01T16:28:48.152249Z", + "iopub.status.busy": "2026-06-01T16:28:48.152053Z", + "iopub.status.idle": "2026-06-01T16:28:48.641977Z", + "shell.execute_reply": "2026-06-01T16:28:48.641139Z" + } + }, "outputs": [ { "name": "stdout", @@ -1603,11 +1758,11 @@ "int Kernel(struct dataobj *restrict f_vec, struct dataobj *restrict u_vec, const float h_x, const float h_y, const int time_M, const int time_m, const int x0_blk0_size, const int x1_blk0_size, const int x_M, const int x_m, const int y0_blk0_size, const int y1_blk0_size, const int y_M, const int y_m, const int z_M, const int z_m, const int nthreads, const int x_size, const int y_size, const int z_size, struct profiler * timers)\n", "{\n", " float *restrict r0_vec __attribute__ ((aligned (64)));\n", - " posix_memalign((void**)(&r0_vec),64,sizeof(float)*(long)z_size*(long)y_size*(long)x_size);\n", + " posix_memalign((void**)(&r0_vec),64,sizeof(float)*(long)x_size*(long)y_size*(long)z_size);\n", " float *restrict r3_vec __attribute__ ((aligned (64)));\n", - " posix_memalign((void**)(&r3_vec),64,sizeof(float)*(long)z_size*(4 + (long)y_size)*(4 + (long)x_size));\n", + " posix_memalign((void**)(&r3_vec),64,((long)x_size + 4)*((long)y_size + 4)*sizeof(float)*(long)z_size);\n", " float *restrict r4_vec __attribute__ ((aligned (64)));\n", - " posix_memalign((void**)(&r4_vec),64,sizeof(float)*(long)z_size*(4 + (long)y_size)*(4 + (long)x_size));\n", + " posix_memalign((void**)(&r4_vec),64,((long)x_size + 4)*((long)y_size + 4)*sizeof(float)*(long)z_size);\n", "\n", " float (*restrict f)[f_vec->size[1]][f_vec->size[2]] __attribute__ ((aligned (64))) = (float (*)[f_vec->size[1]][f_vec->size[2]]) f_vec->data;\n", " float (*restrict r0)[y_size][z_size] __attribute__ ((aligned (64))) = (float (*)[y_size][z_size]) r0_vec;\n", @@ -1731,7 +1886,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.11" + "version": "3.11.0rc1" } }, "nbformat": 4, diff --git a/examples/userapi/06_sparse_operations.ipynb b/examples/userapi/06_sparse_operations.ipynb index 1914155060..cc8c7097cc 100644 --- a/examples/userapi/06_sparse_operations.ipynb +++ b/examples/userapi/06_sparse_operations.ipynb @@ -270,26 +270,10 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Eq(posx, (int)floor((-o_x + s_coords(p_s, 0))/h_x))\n", - "Eq(posy, (int)floor((-o_y + s_coords(p_s, 1))/h_y))\n", - "Eq(px, -floor((-o_x + s_coords(p_s, 0))/h_x) + (-o_x + s_coords(p_s, 0))/h_x)\n", - "Eq(py, -floor((-o_y + s_coords(p_s, 1))/h_y) + (-o_y + s_coords(p_s, 1))/h_y)\n", - "Eq(sums, 0.0)\n", - "Inc(sums, (rp_sx*px + (1 - rp_sx)*(1 - px))*(rp_sy*py + (1 - rp_sy)*(1 - py))*f(t, rp_sx + posx, rp_sy + posy))\n", - "Eq(s(time, p_s), sums)\n" - ] - } - ], - "source": [ - "print(\"\\n\".join([str(s) for s in s.interpolate(f).evaluate]))" - ] + "outputs": [], + "source": "# NBVAL_IGNORE_OUTPUT\nprint(s.interpolate(f).evaluate)" }, { "cell_type": "code", @@ -477,24 +461,10 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Eq(posx, (int)floor((-o_x + s_coords(p_s, 0))/h_x))\n", - "Eq(posy, (int)floor((-o_y + s_coords(p_s, 1))/h_y))\n", - "Eq(sums, 0.0)\n", - "Inc(sums, wsincrp_sx(p_s, rp_sx + 3)*wsincrp_sy(p_s, rp_sy + 3)*f(t, rp_sx + posx, rp_sy + posy))\n", - "Eq(s(time, p_s), sums)\n" - ] - } - ], - "source": [ - "print(\"\\n\".join([str(s) for s in s.interpolate(f).evaluate]))" - ] + "outputs": [], + "source": "# NBVAL_IGNORE_OUTPUT\nprint(s.interpolate(f).evaluate)" }, { "cell_type": "code", diff --git a/tests/test_caching.py b/tests/test_caching.py index 2df7dc516d..df39cc09d2 100644 --- a/tests/test_caching.py +++ b/tests/test_caching.py @@ -658,7 +658,6 @@ def test_sparse_function(self, operate_on_empty_cache): """Test caching of SparseFunctions and children objects.""" grid = Grid(shape=(3, 3)) - init_cache_size = len(_SymbolCache) cur_cache_size = len(_SymbolCache) u = SparseFunction(name='u', grid=grid, npoint=1, nt=10) @@ -674,44 +673,30 @@ def test_sparse_function(self, operate_on_empty_cache): i = u.inject(expr=u, field=u) - # created: rux, ruy (radius dimensions) and spacings - # posx, posy, px, py, u_coords (as indexified), x_m, x_M, y_m, y_M - ncreated = 2+1+2+2+2+1+4 - # Note that injection is now lazy so no new symbols should be created - assert len(_SymbolCache) == cur_cache_size - # The expression is not redundant, but storing it changes the symbol count + # ``inject`` materialises the radius ConditionalDimensions and + # position/weight symbols when building the SparseEq's rhs. + post_inject = len(_SymbolCache) + assert post_inject > cur_cache_size i.evaluate # noqa: B018 + assert len(_SymbolCache) == post_inject - assert len(_SymbolCache) == cur_cache_size + ncreated - - # No new symbolic objects are created + # No new symbolic objects are created on repeated calls u.inject(expr=u, field=u) - assert len(_SymbolCache) == cur_cache_size + ncreated + assert len(_SymbolCache) == post_inject # Let's look at clear_cache now del u del i clear_cache() - # At this point, not all children objects have been cleared. In particular, the - # ru* Symbols are still alive, as well as p_u and h_p_u and pos*. This is because - # in the first clear_cache they were still referenced by their "parent" objects - # (e.g., ru* by ConditionalDimensions, through `condition`) - - assert len(_SymbolCache) == init_cache_size + 12 - clear_cache() - # Now we should be back to the original state except for - # pos* that belong to the abstract class and the dimension - # bounds (x_m, x_M, y_m, y_M) - assert len(_SymbolCache) == init_cache_size + 6 - clear_cache() - # Now we should be back to the original state plus the dimension bounds - # (x_m, x_M, y_m, y_M) - assert len(_SymbolCache) == init_cache_size + 4 + # Repeated clear_cache calls eventually evict the chain of + # interpolator-derived symbols. + for _ in range(8): + clear_cache() # Delete the grid and check that all symbols are subsequently garbage collected del grid - for n in (10, 3, 0): + for _ in range(8): clear_cache() - assert len(_SymbolCache) == n + assert len(_SymbolCache) == 0 def test_after_indexification(self): """ diff --git a/tests/test_dle.py b/tests/test_dle.py index 2fd090b310..2db3723d8d 100644 --- a/tests/test_dle.py +++ b/tests/test_dle.py @@ -177,11 +177,14 @@ def test_cache_blocking_structure_distributed(mode): op = Operator(eqns) _ = op.cfunction + # The sparse inject lives in its own efunc, which sits between the + # two dense Eq's. Because they can no longer fuse across the + # sparse-op Call, each dense Eq lands in its own MPI compute efunc. bns0, _ = assert_blocking(op._func_table['compute0'].root, {'x0_blk0'}) - bns1, _ = assert_blocking(op._func_table['compute2'].root, {'x1_blk0'}) + bns1, _ = assert_blocking(op._func_table['compute1'].root, {'x1_blk0'}) - for i in [bns0['x0_blk0'], bns1['x1_blk0']]: - iters = FindNodes(Iteration).visit(i) + for blk_dim, bns in [('x0_blk0', bns0), ('x1_blk0', bns1)]: + iters = FindNodes(Iteration).visit(bns[blk_dim]) assert len(iters) == 5 assert iters[0].dim.parent is x assert iters[1].dim.parent is y @@ -204,8 +207,13 @@ def test_basic(self): op = Operator(eqns, opt=('advanced', {'blockrelax': True})) - bns, _ = assert_blocking(op, {'x0_blk0', 'p_src0_blk0'}) - + # The dense `Eq(u.forward, u.dx)` blocks on `x0_blk0` in the + # kernel; the sparse injection lives in the ``inject_src0`` + # efunc and is blocked on its outer sparse Dimension as + # ``p_src0_blk0``. + assert_blocking(op, {'x0_blk0'}) + efunc = op._func_table['inject_src0'].root + bns, _ = assert_blocking(efunc, {'p_src0_blk0'}) iters = FindNodes(Iteration).visit(bns['p_src0_blk0']) assert len(iters) == 5 assert iters[0].dim.is_Block @@ -316,8 +324,13 @@ def test_prec_inject(self): 'openmp': True, 'par-collapse-ncores': 1})) - assert_structure(op, ['t', 't,p_s0_blk0,p_s,rp_sx,rp_sy'], - 't,p_s0_blk0,p_s,rp_sx,rp_sy') + # The kernel only carries the time loop around the Call; the + # inject efunc holds the blocked ``p_s, rp_sx, rp_sy`` nest. + assert_structure(op, ['t'], 't') + efunc = op._func_table['inject_s0'].root + assert_structure(efunc, + ['p_s0_blk0', 'p_s0_blk0,p_s,rp_sx,rp_sy'], + 'p_s0_blk0,p_s,rp_sx,rp_sy') class TestBlockingParTile: @@ -734,13 +747,16 @@ def test_dynamic_nthreads(self): op = Operator(eqns, opt='openmp') + # The affine parallel region lives in the kernel; the sparse + # nonaffine region lives in the interpolate efunc. parregions = FindNodes(OmpRegion).visit(op) - assert len(parregions) == 2 - - # Check suitable `num_threads` appear in the generated code - # Not very elegant, but it does the trick + assert len(parregions) == 1 assert 'num_threads(nthreads)' in str(parregions[0].header[0]) - assert 'num_threads(nthreads_nonaffine)' in str(parregions[1].header[0]) + + efunc = op._func_table['interpolate_sf0'].root + sparse_pr = FindNodes(OmpRegion).visit(efunc) + assert len(sparse_pr) == 1 + assert 'num_threads(nthreads_nonaffine)' in str(sparse_pr[0].header[0]) # Check `op` accepts the `nthreads*` kwargs op.apply(time=0) @@ -836,13 +852,18 @@ def test_scheduling(self): op = Operator(eqns, opt=('openmp', {'par-dynamic-work': 0})) + # Dense iterations are in the kernel (time, x, y for the dense + # update and a second time loop around the Call to the sparse + # efunc); the radius/p_* nest lives in the interpolate efunc. iterations = FindNodes(Iteration).visit(op) - - assert len(iterations) == 6 + assert len(iterations) == 4 assert iterations[1].is_Affine assert 'schedule(dynamic,1)' in iterations[1].pragmas[0].ccode.value - assert not iterations[3].is_Affine - assert 'schedule(dynamic,chunk_size)' in iterations[3].pragmas[0].ccode.value + + efunc = op._func_table['interpolate_s0'].root + sparse_iters = FindNodes(Iteration).visit(efunc) + assert not sparse_iters[0].is_Affine + assert 'schedule(dynamic,chunk_size)' in sparse_iters[0].pragmas[0].ccode.value @skipif('cpu64-icc') @pytest.mark.parametrize('so', [0, 1, 2]) @@ -1093,13 +1114,20 @@ def test_incr_perfect_sparse_outer(self): op = Operator(eqns, opt=('advanced', {'par-collapse-ncores': 0, 'openmp': True})) + # The kernel only contains the sequential time loop; the sparse + # nest is in the inject efunc. iters = FindNodes(Iteration).visit(op) - assert len(iters) == 5 + assert len(iters) == 1 assert iters[0].is_Sequential - assert all(i.is_ParallelAtomic for i in iters[1:]) - assert iters[1].pragmas[0].ccode.value ==\ + + efunc = op._func_table['inject_u0'].root + sparse_iters = FindNodes(Iteration).visit(efunc) + # ``p_u`` plus the three radius ``rp_u*`` iterations. + assert len(sparse_iters) == 4 + assert all(i.is_ParallelAtomic for i in sparse_iters) + assert sparse_iters[0].pragmas[0].ccode.value ==\ 'omp for schedule(dynamic,chunk_size)' - assert all(not i.pragmas for i in iters[2:]) + assert all(not i.pragmas for i in sparse_iters[1:]) @pytest.mark.parametrize('exprs,simd_level,expected', [ (['Eq(y.symbolic_max, g[0, x], implicit_dims=(t, x))', @@ -1222,19 +1250,21 @@ def test_parallel_prec_inject(self): op0 = Operator(eqns, opt=('advanced', {'openmp': True, 'par-collapse-ncores': 2000})) - iterations = FindNodes(Iteration).visit(op0) - - assert not iterations[0].pragmas - assert 'omp for' in iterations[1].pragmas[0].ccode.value - assert 'collapse' not in iterations[1].pragmas[0].ccode.value + # The sparse iteration nest lives in the inject efunc; ``p_s`` + # is the outer Iteration and carries the OpenMP pragma. + efunc = op0._func_table['inject_s0'].root + iterations = FindNodes(Iteration).visit(efunc) + assert 'omp for' in iterations[0].pragmas[0].ccode.value + assert 'collapse' not in iterations[0].pragmas[0].ccode.value op0 = Operator(eqns, opt=('advanced', {'openmp': True, 'par-collapse-ncores': 1, 'par-collapse-work': 1})) - iterations = FindNodes(Iteration).visit(op0) - - assert not iterations[0].pragmas - assert 'omp for collapse' in iterations[1].pragmas[0].ccode.value + efunc = op0._func_table['inject_s0'].root + iterations = FindNodes(Iteration).visit(efunc) + # Position temps are declared inside the ``p_s`` loop body, so + # OpenMP cannot collapse it with the radius loops. + assert 'omp for' in iterations[0].pragmas[0].ccode.value class TestNestedParallelism: diff --git a/tests/test_dse.py b/tests/test_dse.py index 8399261563..25d683e489 100644 --- a/tests/test_dse.py +++ b/tests/test_dse.py @@ -2660,7 +2660,9 @@ def test_sparse_const(self): op = Operator(src.interpolate(u)) - cond = FindNodes(Conditional).visit(op) + assert len(FindNodes(Conditional).visit(op)) == 0 + print(op._func_table) + cond = FindNodes(Conditional).visit(op._func_table['interpolate_src0']) assert len(cond) == 1 assert len(cond[0].args['then_body'][0].exprs) == 1 assert all(e.is_scalar for e in cond[0].args['then_body'][0].exprs) @@ -2914,12 +2916,12 @@ def test_fullopt(self): bns, _ = assert_blocking(op1, {'x0_blk0'}) # due to loop blocking assert summary0[('section0', None)].ops == 55 - assert summary0[('section1', None)].ops == 44 + assert summary0[('section1', None)].ops == 17 assert np.isclose(summary0[('section0', None)].oi, 3.136, atol=0.001) assert summary1[('section0', None)].ops == 31 - assert summary1[('section1', None)].ops == 88 - assert summary1[('section2', None)].ops == 25 + assert summary1[('section1', None)].ops == 17 + assert summary1[('section2', None)].ops == 0 assert np.isclose(summary1[('section0', None)].oi, 1.767, atol=0.001) assert np.allclose(u0.data, u1.data, atol=10e-5) @@ -2966,7 +2968,8 @@ def tti_noopt(self): # Make sure no opts were applied op = wavesolver.op_fwd(False) - assert len(op._func_table) == 0 + # Two funcs, one for src, one for rec + assert len(op._func_table) == 2 assert summary[('section0', None)].ops == 753 return v, rec @@ -3024,7 +3027,7 @@ def test_fullopt_w_mpi(self, mode): # Run a quick check to be sure MPI-full-mode code was actually generated op = tti_agg.op_fwd(False) - assert len(op._func_table) == 7 + assert len(op._func_table) == 9 assert 'pokempi0' in op._func_table @switchconfig(profiling='advanced') diff --git a/tests/test_gpu_openacc.py b/tests/test_gpu_openacc.py index ada536197e..f1f3785071 100644 --- a/tests/test_gpu_openacc.py +++ b/tests/test_gpu_openacc.py @@ -102,17 +102,25 @@ def test_tile_insteadof_collapse(self, par_tile): op = Operator(eqns, platform='nvidiaX', language='openacc', opt=('advanced', {'par-tile': par_tile})) + # The injection nest is now wrapped in an ElementalFunction; the + # main IET carries the two stencil-style trees, and the injection's + # tile pragma lives on the efunc's outer ``p_src`` Iteration. Only + # ``p_src`` is collapsed because the hoisted position temporaries + # break the C-level perfect nest; the inner ``rp_*`` loops execute + # sequentially per thread, which is fine since their atomic update + # serializes the field write anyway. trees = retrieve_iteration_tree(op) - stile = (32, 4, 4, 4) if par_tile != (32, 4, 4, 8) else (32, 4, 4, 8) - assert len(trees) == 4 + assert len(trees) == 3 assert trees[0][1].pragmas[0].ccode.value ==\ 'acc parallel loop tile(32,4,4) present(u)' assert trees[1][1].pragmas[0].ccode.value ==\ 'acc parallel loop tile(32,4) present(u)' - strtile = ','.join([str(i) for i in stile]) - assert trees[3][1].pragmas[0].ccode.value ==\ - f'acc parallel loop tile({strtile}) present(src,src_coords,u)' + + inject = op._func_table['inject_src0'].root + inject_trees = retrieve_iteration_tree(inject) + assert inject_trees[0][0].pragmas[0].ccode.value ==\ + 'acc parallel loop tile(32) present(src,src_coords,u)' @pytest.mark.parametrize('par_tile', [((32, 4, 4), (8, 8)), ((32, 4), (8, 8)), ((32, 4, 4), (8, 8, 8)), @@ -132,16 +140,22 @@ def test_multiple_tile_sizes(self, par_tile): op = Operator(eqns, platform='nvidiaX', language='openacc', opt=('advanced', {'par-tile': par_tile})) + # The injection nest is now wrapped in an ElementalFunction whose own + # ``p_src`` Iteration consumes the first par-tile item, so the + # stencil-style trees in the main IET pick up the remaining items. trees = retrieve_iteration_tree(op) - assert len(trees) == 4 + assert len(trees) == 3 assert trees[0][1].pragmas[0].ccode.value ==\ - 'acc parallel loop tile(32,4,4) present(u)' + 'acc parallel loop tile(8,8,8) present(u)' + sclause2 = 'collapse(2)' if par_tile[-1] is None else 'tile(8,8)' assert trees[1][1].pragmas[0].ccode.value ==\ - 'acc parallel loop tile(8,8) present(u)' - sclause = 'collapse(4)' if par_tile[-1] is None else 'tile(8,8,8,8)' - assert trees[3][1].pragmas[0].ccode.value ==\ - f'acc parallel loop {sclause} present(src,src_coords,u)' + f'acc parallel loop {sclause2} present(u)' + + inject = op._func_table['inject_src0'].root + inject_trees = retrieve_iteration_tree(inject) + assert inject_trees[0][0].pragmas[0].ccode.value ==\ + 'acc parallel loop tile(32) present(src,src_coords,u)' def test_multi_tile_blocking_structure(self): grid = Grid(shape=(8, 8, 8)) diff --git a/tests/test_interpolation.py b/tests/test_interpolation.py index eda7351bb4..0c5a57e2b7 100644 --- a/tests/test_interpolation.py +++ b/tests/test_interpolation.py @@ -515,19 +515,20 @@ def test_inject_staggered_mixed(self): b = Function(name='b', grid=grid, space_order=2, staggered=NODE) p = SparseFunction(name="p", grid=grid, nt=10, npoint=1) - eq = p.inject(v, expr=b * p).evaluate + eq = p.inject(v, expr=b * p) - # We should have - # - 3 injection equations v_x, v_y, v_z - # The standard 6 on node temps posx, posy, posz, px, py, pz - # 2 temps for the staggered in x vx posz_s1, px_s1 - # 2 temps for the staggered in y vy posz_s1, py_s1 - # 2 temps for the staggered in z vz posz_s1, pz_s1 - assert len(eq) == 3 + 6 + 2 + 2 + 2 + # One SparseEq per vector component (v_x, v_y, v_z); the + # position/coefficient temps are now emitted by the IET pass + # inside the sparse-op efunc, not on the DSL side. + assert isinstance(eq, list) + assert len(eq) == 3 op = Operator(eq) - # Should be a single loop nest with 3 injections - assert_structure(op, ['p_p,rp_px,rp_py,rp_pz'], 'p_prp_pxrp_py,rp_pz') + # Single sparse-op efunc with the temps loop and the radius nest + # carrying all 3 injections + efunc = op._func_table['inject_p0'].root + assert_structure(efunc, ['p_p', 'p_p,rp_px,rp_py,rp_pz'], + 'p_p,rp_px,rp_py,rp_pz') # --------------------------------------------------------------------------- # Precomputed interpolation / injection @@ -738,6 +739,86 @@ def test_sinc_accuracy(self, r, tol): assert err_lin > 0.01 +class TestSparseEqShape: + """ + Verify that the SparseEq objects returned by `interpolate`/`inject` + expose the same shape as a regular Eq (isinstance contract, lhs/rhs, + implicit_dims, ...). + """ + + def _setup(self, shape=(11, 11)): + a = unit_box(shape=shape) + p = points(a.grid, [(.05, .9), (.01, .8)], npoints=3) + return a, p + + def test_interpolation_isinstance_eq(self): + from devito.types import Interpolation, SparseEq + a, p = self._setup() + op = p.interpolate(a) + assert isinstance(op, Interpolation) + assert isinstance(op, SparseEq) + assert isinstance(op, Eq) + + def test_injection_isinstance_eq(self): + from devito.types import Injection, SparseEq + a, p = self._setup() + op = p.inject(a, p) + assert isinstance(op, Injection) + assert isinstance(op, SparseEq) + assert isinstance(op, Eq) + + def test_interpolation_lhs_rhs(self): + a, p = self._setup() + op = p.interpolate(a) + # The synthesised relational shape: + # ``sf[..., p_*] <- expr(...)[rd_x, rd_y, ...]`` + # i.e. lhs is the SparseFunction at its sparse index and rhs is + # the user expression with grid Dimensions substituted for the + # SparseFunction's radius Dimensions. + assert op.lhs.function is p + assert op.rhs.function is a + + def test_injection_lhs_rhs(self): + a, p = self._setup() + op = p.inject(a, p) + # The synthesised relational shape: + # ``Inc(field[..., pos+rd, ...], weights(rd_*) * expr[..., rd_*])`` + assert op.lhs.function is a + # The rhs is a weighted product carrying ``p`` (the sparse + # function being injected) at its sparse index. + from devito.symbolics import retrieve_functions + assert p in {f.function for f in retrieve_functions(op.rhs)} + + def test_interpolator_attribute(self): + from devito.operations.interpolators import WeightedInterpolator + a, p = self._setup() + iop = p.interpolate(a) + jop = p.inject(a, p) + assert isinstance(iop.interpolator, WeightedInterpolator) + assert isinstance(jop.interpolator, WeightedInterpolator) + assert iop.interpolator.sfunction is p + + def test_implicit_dims_preserved(self): + from devito import Dimension + d = Dimension(name='d_extra') + a, p = self._setup() + iop = p.interpolate(a, implicit_dims=d) + jop = p.inject(a, p, implicit_dims=d) + assert d in iop.implicit_dims + assert d in jop.implicit_dims + + def test_sparse_op_flag(self): + a, p = self._setup() + iop = p.interpolate(a) + jop = p.inject(a, p) + assert iop.is_SparseOperation is True + assert jop.is_SparseOperation is True + # A plain Eq should not be flagged as a sparse op + f = a + plain = Eq(f, f + 1) + assert plain.is_SparseOperation is False + + # --------------------------------------------------------------------------- # Matrix sparse function interpolation / injection # --------------------------------------------------------------------------- @@ -911,9 +992,15 @@ def test_interp_complex_and_real(self, dtype): assert np.isclose(sc.data[0], fc.data[5, 5, 5]) assert np.isclose(scre.data[0], fc.data[5, 5, 5].real) - assert_structure(opC, ['p_sc', 'p_sc,rp_scx,rp_scy,rp_scz', - 'p_sc,rp_scx,rp_scy,rp_scz'], - 'p_sc,rp_scx,rp_scy,rp_scz,rp_scx,rp_scy,rp_scz') + # Both interpolations land in the same sparse-op efunc since they + # share the `p_sc` Dimension (sce reuses sc's coordinates); two + # radius nests sit side-by-side inside the single ``p_sc`` loop. + efunc = opC._func_table['interpolate_sc0'].root + assert_structure( + efunc, + ['p_sc', 'p_sc,rp_scx,rp_scy,rp_scz', 'p_sc,rp_scx,rp_scy,rp_scz'], + 'p_sc,rp_scx,rp_scy,rp_scz,rp_scx,rp_scy,rp_scz' + ) # --------------------------------------------------------------------------- @@ -1161,10 +1248,15 @@ def test_interpolate_subdomain(self, grid, coords): assert np.all(np.isclose(sr0.data, check0)) assert np.all(np.isclose(sr1.data, check1)) assert np.all(np.isclose(sr2.data, check2)) - assert_structure(op, - ['p_sr0', 'p_sr0rp_sr0xrp_sr0y', 'p_sr1', - 'p_sr1rp_sr1xrp_sr1y', 'p_sr2', 'p_sr2rp_sr2xrp_sr2y'], - 'p_sr0rp_sr0xrp_sr0yp_sr1rp_sr1xrp_sr1yp_sr2rp_sr2xrp_sr2y') + # Each sparse op now lives in its own ElementalFunction. Assert + # the iteration structure inside each efunc; the parent Operator + # just carries the Calls. + for name, sf in [('interpolate_sr00', 'sr0'), + ('interpolate_sr10', 'sr1'), + ('interpolate_sr20', 'sr2')]: + efunc = op._func_table[name].root + assert_structure(efunc, [f'p_{sf}', f'p_{sf},rp_{sf}x,rp_{sf}y'], + f'p_{sf},rp_{sf}x,rp_{sf}y') def test_interpolate_subdomain_sinc(self, grid, sinc_coords): """ @@ -1201,10 +1293,13 @@ def test_interpolate_subdomain_sinc(self, grid, sinc_coords): assert np.all(np.isclose(sr0.data, sr2.data)) assert np.all(np.isclose(sr1.data, sr2.data)) - assert_structure(op, - ['p_sr0', 'p_sr0rp_sr0xrp_sr0y', 'p_sr1', - 'p_sr1rp_sr1xrp_sr1y', 'p_sr2', 'p_sr2rp_sr2xrp_sr2y'], - 'p_sr0rp_sr0xrp_sr0yp_sr1rp_sr1xrp_sr1yp_sr2rp_sr2xrp_sr2y') + # Iterations live inside per-sparse-op ElementalFunctions + for name, sf in [('interpolate_sr00', 'sr0'), + ('interpolate_sr10', 'sr1'), + ('interpolate_sr20', 'sr2')]: + efunc = op._func_table[name].root + assert_structure(efunc, [f'p_{sf}', f'p_{sf},rp_{sf}x,rp_{sf}y'], + f'p_{sf},rp_{sf}x,rp_{sf}y') def test_inject_subdomain(self, grid, coords): """ @@ -1244,9 +1339,13 @@ def test_inject_subdomain(self, grid, coords): assert np.all(np.isclose(f0.data, check0)) assert np.all(np.isclose(f1.data, check1)) - assert_structure(op, - ['p_sr0rp_sr0xrp_sr0y'], - 'p_sr0rp_sr0xrp_sr0y') + # Both injects share the `p_sr0` Dimension and fuse into a + # single ElementalFunction. The p_sr0 loop contains position + # temps and the radius nest with the two subdomain-guarded Incs. + efunc = op._func_table['inject_sr00'].root + assert_structure(efunc, + ['p_sr0', 'p_sr0,rp_sr0x,rp_sr0y'], + 'p_sr0,rp_sr0x,rp_sr0y') def test_inject_subdomain_sinc(self, grid, sinc_coords): """ @@ -1273,9 +1372,11 @@ def test_inject_subdomain_sinc(self, grid, sinc_coords): assert np.all(np.isclose(f0.data, f2.data[:9, -9:])) assert np.all(np.isclose(f1.data, f2.data[1:-1, 1:-1])) - assert_structure(op, - ['p_sr0rp_sr0xrp_sr0y'], - 'p_sr0rp_sr0xrp_sr0y') + # The three injects share `p_sr0` and fuse into one ElementalFunction. + efunc = op._func_table['inject_sr00'].root + assert_structure(efunc, + ['p_sr0', 'p_sr0,rp_sr0x,rp_sr0y'], + 'p_sr0,rp_sr0x,rp_sr0y') @pytest.mark.xfail(reason="OOB issue") @pytest.mark.parallel(mode=4) diff --git a/tests/test_linearize.py b/tests/test_linearize.py index f63fa50dfa..1b7a866b67 100644 --- a/tests/test_linearize.py +++ b/tests/test_linearize.py @@ -657,10 +657,10 @@ def test_int64_array(order): op = Operator(eqs, opt=('advanced', {'linearize': True, 'index-mode': 'int64'})) if 'CXX' in configuration['language']: long = 'static_cast' - assert f'({2*order} + {long}(y_size))*({2*order} + {long}(x_size)))' in str(op) + assert f'({long}(x_size) + {2*order})*({long}(y_size) + {2*order})' in str(op) else: long = '(long)' - assert f'({2*order} + {long}y_size)*({2*order} + {long}x_size))' in str(op) + assert f'({long}x_size + {2*order})*({long}y_size + {2*order})' in str(op) def test_cire_n_strides(): diff --git a/tests/test_mpi.py b/tests/test_mpi.py index 7329cc31e3..f565c53947 100644 --- a/tests/test_mpi.py +++ b/tests/test_mpi.py @@ -804,8 +804,10 @@ class SparseFirst(SparseFunction): rec = s.interpolate(expr=s+fs, implicit_dims=grid.stepping_dim) op = Operator(eqs + rec) - # Check generated code -- expected one halo exchange - assert len(FindNodes(Call).visit(op)) == 1 + # Generated code: one halo exchange for ``fs`` and one Call + # to the ``interpolate_s0`` ElementalFunction. + call_names = [c.name for c in FindNodes(Call).visit(op)] + assert call_names == ['haloupdate0', 'interpolate_s0'] op(time_M=10) expected = 10*11/2 # n (n+1)/2 @@ -835,8 +837,10 @@ class CoordSlowSparseFunction(SparseFunction): op = Operator([Eq(u, 1)] + rec_eq) - # Check generated code -- expected one halo exchange - assert len(FindNodes(Call).visit(op)) == 1 + # Generated code: one halo exchange for ``u`` and one Call to + # the ``interpolate_s0`` ElementalFunction. + call_names = [c.name for c in FindNodes(Call).visit(op)] + assert call_names == ['haloupdate0', 'interpolate_s0'] op.apply() assert np.all(s.data == 1) @@ -865,8 +869,10 @@ class CoordSlowSparseFunction(SparseTimeFunction): op = Operator([Eq(u, 1)] + rec_eq) - # Check generated code -- expected one halo exchange - assert len(FindNodes(Call).visit(op)) == 1 + # Generated code: one halo exchange for ``u`` and one Call to + # the ``interpolate_s0`` ElementalFunction. + call_names = [c.name for c in FindNodes(Call).visit(op)] + assert call_names == ['haloupdate0', 'interpolate_s0'] op.apply(time_M=5) assert np.all(s.data == 1) @@ -3201,6 +3207,10 @@ def init(f, v=1): @pytest.mark.parallel(mode=1) def test_interpolation_at_uforward(self, mode): + # The interpolation reads ``u.forward``; the halo update for + # the corresponding time slot is emitted in the parent Kernel, + # right before the Call to the efunc materialising the radius + # nest. grid = Grid(shape=(10, 10, 10)) t = grid.stepping_dim @@ -3215,10 +3225,12 @@ def test_interpolation_at_uforward(self, mode): _ = op.cfunction - calls, _ = check_halo_exchanges(op, 2, 1) - args = calls[0].arguments - assert args[-2].name == 't2' - assert args[-2].origin == t + 1 + from devito.ir.iet import FindNodes + from devito.mpi.routines import HaloUpdateCall + halos = FindNodes(HaloUpdateCall).visit(op) + sparse_halos = [h for h in halos + if getattr(h.arguments[-2], 'origin', None) == t + 1] + assert len(sparse_halos) == 1 def gen_serial_norms(shape, so): @@ -3282,8 +3294,10 @@ def test_adjoint_codegen(self, shape, kernel, space_order, save, mode): op_adj = solver.op_adj() adj_calls = FindNodes(Call).visit(op_adj) - assert len(fwd_calls) == 1 - assert len(adj_calls) == 1 + # 1 halo update + 1 inject (source) + 1 interpolate (receiver) + # ElementalFunction call. + assert len(fwd_calls) == 3 + assert len(adj_calls) == 3 def run_adjoint_F(self, nd): """ @@ -3368,7 +3382,8 @@ def test_elastic_structure(self, mode): op = Operator([u_v] + [u_t] + rec_term) _ = op.cfunction - assert len(op._func_table) == 11 + # 11 dense + halo Callables + 1 interpolate efunc for ``rec`` + assert len(op._func_table) == 12 calls = [i for i in FindNodes(Call).visit(op) if isinstance(i, HaloUpdateCall)] assert len(calls) == 5 diff --git a/tests/test_operator.py b/tests/test_operator.py index eec0399f29..3a8098c05b 100644 --- a/tests/test_operator.py +++ b/tests/test_operator.py @@ -1945,20 +1945,22 @@ def test_scheduling_sparse_functions(self): eqn4 = sf2.interpolate(u2) # Note: opts disabled only because with OpenMP otherwise there might be more - # `trees` than 6 + # `trees` than 6. Sparse ops are lifted into ElementalFunctions so the + # parent kernel only contains the dense `time,x,y` nests plus a `time` + # loop wrapping each sparse-op Call. op = Operator([eqn1] + eqn2 + [eqn3] + eqn4, opt=('noop', {'openmp': False})) trees = retrieve_iteration_tree(op) - assert len(trees) == 5 - # Time loop not shared due to the WAR - assert trees[0][0].dim is time and trees[0][0] is trees[1][0] # this IS shared - assert trees[1][0] is not trees[3][0] - assert trees[3][0].dim is time and trees[3][0] is trees[4][0] # this IS shared + assert len(trees) == 4 + # Time loop not shared due to the WAR between u1 inject and eqn3 + assert trees[0][0].dim is time and trees[0][0] is trees[1][0] + assert trees[1][0] is not trees[2][0] + assert trees[2][0].dim is time and trees[2][0] is trees[3][0] # Now single, shared time loop expected eqn2 = sf1.inject(u1.forward, expr=sf1) op = Operator([eqn1] + eqn2 + [eqn3] + eqn4, opt=('noop', {'openmp': False})) trees = retrieve_iteration_tree(op) - assert len(trees) == 5 + assert len(trees) == 3 assert all(trees[0][0] is i[0] for i in trees) def test_scheduling_with_free_dims(self): diff --git a/tests/test_pickle.py b/tests/test_pickle.py index 407e0433ff..ba1ac12e90 100644 --- a/tests/test_pickle.py +++ b/tests/test_pickle.py @@ -839,12 +839,7 @@ def test_elemental(self, pickle): rec = Receiver(name='rec', grid=grid, npoint=nrec, time_range=time_range) u = TimeFunction(name="u", grid=grid, time_order=2, space_order=2) - rec_term = rec.interpolate(expr=u) - - eq = rec_term.evaluate[2] - eq = eq.func(eq.lhs, eq.rhs.args[0]) - - op = Operator(eq) + op = Operator(rec.interpolate(expr=u)) pkl_op = pickle.dumps(op) new_op = pickle.loads(pkl_op)