@@ -26,23 +26,21 @@ For example:
2626.. code-block :: python
2727
2828 import dask.config
29- import dask.dataframe as dd
29+ import dask.array as da
3030 from distributed import Client, span
3131
32+ # Read important note below
3233 dask.config.set({" optimization.fuse.active" : False })
3334 client = Client()
3435
3536 with span(" Alice's workflow" ):
3637 with span(" data load" ):
37- df = dd.read_parquet(... )
38-
38+ a = da.read_zarr(... )
3939 with span(" ML preprocessing" ):
40- df = preprocess(df)
41-
40+ a = preprocess(a)
4241 with span(" Model training" ):
43- model = train(df)
44-
45- model = model.compute()
42+ model = train(a)
43+ model = model.compute()
4644
4745 Note how the :func: `span ` context manager can be nested.
4846The example will create the following spans on the scheduler:
@@ -95,10 +93,16 @@ Additionally, spans can be queried using scheduler extensions or
9593
9694User API
9795--------
98- .. warning ::
96+ .. important ::
9997
100- Spans are based on annotations, and just like annotations they can be lost during
101- optimization. To prevent this issue, you must set
98+ For dataframes, spans have a minimum granularity of a single call to `compute()` or
99+ `persist()` and can't be broken down further into groups of operations - if the example
100+ above used dataframes, everything would have been uniformly tagged as "Alice's workflow",
101+ as it is the span that's active during `compute()`.
102+
103+ In other collections, such as arrays and delayed objects, spans that don't wrap
104+ around a call to `compute()` or `persist()` can get lost during the optimization
105+ phase. To prevent this issue, you must set
102106
103107 >>> dask.config.set({" optimization.fuse.active" : False })
104108
@@ -110,6 +114,23 @@ User API
110114 fuse :
111115 active : false
112116
117+ A possible workaround, which also works for dataframes, is to perform
118+ intermediate calls to `persist()`. Note, however, that this can significantly
119+ impact optimizations and reduce overall performance.
120+
121+ .. code-block :: python
122+
123+ with span(" Alice's workflow" ):
124+ with span(" data load" ):
125+ a = dd.read_parquet(... ).persist()
126+ with span(" ML preprocessing" ):
127+ b = preprocess(a).persist()
128+ del a # Release distributed memory for a as soon as possible
129+ with span(" Model training" ):
130+ model = train(b).persist()
131+ del b # Release distributed memory for b as soon as possible
132+ model = model.compute()
133+
113134 .. autofunction :: span
114135
115136
0 commit comments