Skip to content

Commit 8d0f662

Browse files
authored
[Fuzzing] Use initial contents in ClusterFuzz (#7192)
The ClusterFuzz bundler now looks through all of our test suites and packages all testcases that are suitable for ClusterFuzz to use. This adds more variety to the wasm files we fuzz there, as the test suite has corner cases that the main fuzzer is unlikely to generate. To make debugging easier, this also adds a comment in the JS whenever it uses initial content, something like `[10, 20, 30] /* using initial content 17.wasm */` (this is the reason for the change to extract_wasms.py).
1 parent f9d78d8 commit 8d0f662

4 files changed

Lines changed: 140 additions & 4 deletions

File tree

scripts/bundle_clusterfuzz.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@
7171
'''
7272

7373
import os
74+
import subprocess
7475
import sys
7576
import tarfile
7677

@@ -87,7 +88,9 @@
8788
# Delete the argument, as importing |shared| scans it.
8889
sys.argv.pop()
8990

91+
from test import fuzzing # noqa
9092
from test import shared # noqa
93+
from test import support # noqa
9194

9295
# Pick where to get the builds
9396
if build_dir:
@@ -97,6 +100,14 @@
97100
binaryen_bin = shared.options.binaryen_bin
98101
binaryen_lib = shared.options.binaryen_lib
99102

103+
# ClusterFuzz's run.py uses these features. Keep this in sync with that, so that
104+
# we only bundle initial content that makes sense for it.
105+
features = [
106+
'-all',
107+
'--disable-shared-everything',
108+
'--disable-fp16',
109+
]
110+
100111
with tarfile.open(output_file, "w:gz") as tar:
101112
# run.py
102113
run = os.path.join(shared.options.binaryen_root, 'scripts', 'clusterfuzz', 'run.py')
@@ -128,6 +139,40 @@
128139
print(f' ......... : {path}')
129140
tar.add(path, arcname=f'lib/{name}')
130141

142+
# Add tests we will use as initial content under initial/. We put all the
143+
# tests from the test suite there.
144+
print(' .. initial content: ')
145+
temp_wasm = 'temp.wasm'
146+
index = 0
147+
all_tests = shared.get_all_tests()
148+
for i, test in enumerate(all_tests):
149+
if not fuzzing.is_fuzzable(test):
150+
continue
151+
for wast, asserts in support.split_wast(test):
152+
if not wast:
153+
continue
154+
support.write_wast(temp_wasm, wast)
155+
# If the file is not valid for our features, skip it. In the same
156+
# operation, also convert to binary if this was text (binary is more
157+
# compact).
158+
cmd = shared.WASM_OPT + ['-q', temp_wasm, '-o', temp_wasm] + features
159+
if subprocess.run(cmd, stderr=subprocess.PIPE).returncode:
160+
continue
161+
162+
# Looks good.
163+
tar.add(temp_wasm, arcname=f'initial/{index}.wasm')
164+
index += 1
165+
print(f'\r {100 * i / len(all_tests):.2f}%', end='', flush=True)
166+
print(f' (num: {index})')
167+
168+
# Write initial/num.txt which contains the number of testcases in that
169+
# directory (saves run.py from needing to listdir each time).
170+
num_txt = 'num.txt'
171+
with open(num_txt, 'w') as f:
172+
f.write(f'{index}')
173+
tar.add(num_txt, arcname='initial/num.txt')
174+
175+
131176
print('Done.')
132177
print('To run the tests on this bundle, do:')
133178
print()

scripts/clusterfuzz/extract_wasms.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ def repl(text):
6767

6868

6969
# Replace the wasm files and write them out.
70-
js = re.sub(r'var \w+ = new Uint8Array\(\[([\d,]+)\]\);', repl, js)
70+
js = re.sub(r'var \w+ = new Uint8Array\(\[([\d,]+)\]\)', repl, js)
7171

7272
# Write out the new JS.
7373
with open(f'{out}.js', 'w') as f:

scripts/clusterfuzz/run.py

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,12 @@
6868
# testcase.
6969
JS_SHELL_PATH = os.path.join(ROOT_DIR, 'scripts', 'fuzz_shell.js')
7070

71+
# The path to the directory with initial contents.
72+
INITIAL_CONTENT_PATH = os.path.join(ROOT_DIR, 'initial')
73+
74+
# The file that contains the number of initial contents
75+
INITIAL_CONTENT_NUM_PATH = os.path.join(ROOT_DIR, 'initial', 'num.txt')
76+
7177
# The arguments we provide to wasm-opt to generate wasm files.
7278
FUZZER_ARGS = [
7379
# Generate a wasm from random data.
@@ -76,7 +82,8 @@
7682
'--fuzz-passes',
7783
# Enable all features but disable ones not yet ready for fuzzing. This may
7884
# be a smaller set than fuzz_opt.py, as that enables a few experimental
79-
# flags, while here we just fuzz with d8's --wasm-staging.
85+
# flags, while here we just fuzz with d8's --wasm-staging. This should be
86+
# synchronized with bundle_clusterfuzz.
8087
'-all',
8188
'--disable-shared-everything',
8289
'--disable-fp16',
@@ -92,6 +99,17 @@ def get_file_name(prefix, index):
9299
# (We also use urandom below, which uses this under the hood.)
93100
system_random = random.SystemRandom()
94101

102+
# The number of initial content testcases that were bundled for us, in the
103+
# "initial/" subdir.
104+
with open(INITIAL_CONTENT_NUM_PATH) as f:
105+
num_initial_contents = int(f.read())
106+
107+
108+
def get_random_initial_content():
109+
index = system_random.randint(0, num_initial_contents - 1)
110+
return os.path.join(INITIAL_CONTENT_PATH, f'{index}.wasm')
111+
112+
95113
# In production ClusterFuzz we retry whenever we see a wasm-opt error. We are
96114
# not looking for wasm-opt issues there, and just use it to generate testcases
97115
# for VMs. For local testing, however, we may want to disable retrying, which
@@ -117,9 +135,19 @@ def get_wasm_contents(i, output_dir):
117135
with open(input_data_file_path, 'wb') as file:
118136
file.write(os.urandom(random_size))
119137

120-
# Generate wasm from the random data.
138+
# Generate a command to use wasm-opt with the proper args to generate
139+
# wasm content from the input data.
121140
cmd = [FUZZER_BINARY_PATH] + FUZZER_ARGS
122141
cmd += ['-o', wasm_file_path, input_data_file_path]
142+
143+
# Sometimes use a file from the initial content testcases.
144+
if system_random.random() < 0.5:
145+
initial_content = get_random_initial_content()
146+
cmd += ['--initial-fuzz=' + initial_content]
147+
else:
148+
initial_content = None
149+
150+
# Generate wasm from the random data.
123151
try:
124152
subprocess.check_call(cmd)
125153
except subprocess.CalledProcessError:
@@ -148,7 +176,10 @@ def get_wasm_contents(i, output_dir):
148176

149177
# Convert to a string, and wrap into a typed array.
150178
wasm_contents = ','.join([str(c) for c in wasm_contents])
151-
return f'new Uint8Array([{wasm_contents}])'
179+
js = f'new Uint8Array([{wasm_contents}])'
180+
if initial_content:
181+
js = f'{js} /* using initial content {os.path.basename(initial_content)} */'
182+
return js
152183

153184

154185
# Returns the contents of a .js fuzz file, given the index of the testcase and

test/unit/test_cluster_fuzz.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,15 @@ def test_file_contents(self):
282282
seen_calls = []
283283
seen_second_builds = []
284284
seen_JSPIs = []
285+
seen_initial_contents = []
286+
287+
# Initial contents are noted in comments like this:
288+
#
289+
# /* using initial content 42.wasm */
290+
#
291+
# Note that we may see more than one in a file, as we may have more than
292+
# one wasm in each testcase: each wasm has a chance.
293+
initial_content_regex = re.compile(r'[/][*] using initial content ([^ ]+) [*][/]')
285294

286295
for i in range(1, N + 1):
287296
fuzz_file = os.path.join(temp_dir.name, f'fuzz-binaryen-{i}.js')
@@ -302,6 +311,8 @@ def test_file_contents(self):
302311
assert '/* async */' in js
303312
assert '/* await */' in js
304313

314+
seen_initial_contents.append(re.findall(initial_content_regex, js))
315+
305316
# There is always one build and one call (those are in the default
306317
# fuzz_shell.js), and we add a couple of operations, each with equal
307318
# probability to be a build or a call, so over the 100 testcases here we
@@ -346,6 +357,55 @@ def test_file_contents(self):
346357

347358
print()
348359

360+
# Flatten the data to help some of the below, from
361+
# [['a.wasm', 'b.wasm'], ['c.wasm']]
362+
# into
363+
# ['a.wasm', 'b.wasm', 'c.wasm']
364+
flat_initial_contents = [item for items in seen_initial_contents for item in items]
365+
366+
# Initial content appears 50% of the time for each wasm file. Each
367+
# testcase has 1.333 wasm files on average.
368+
print('Initial contents are distributed as ~ mean 0.68')
369+
print(f'mean initial contents: {len(flat_initial_contents) / N}')
370+
# Initial contents should be mostly unique (we have many, many testcases
371+
# and we pick just 100 or so). And we must see more than one unique one.
372+
unique_initial_contents = set(flat_initial_contents)
373+
print(f'unique initial contents: {len(unique_initial_contents)} should be almost equal to {len(flat_initial_contents)}')
374+
self.assertGreater(len(unique_initial_contents), 1)
375+
# Not all testcases have initial contents.
376+
num_initial_contents = [len(items) for items in seen_initial_contents]
377+
self.assertEqual(min(num_initial_contents), 0)
378+
# Some do (this is redundant given that the set of unique initial
379+
# contents was asserted on before, so this just confirms/checks that).
380+
self.assertGreaterEqual(max(num_initial_contents), 1)
381+
382+
print()
383+
384+
# Execute the files in V8. Almost all should execute properly (some
385+
# small number may trap during startup, say on a segment out of bounds).
386+
if shared.V8:
387+
valid_executions = 0
388+
for i in range(1, N + 1):
389+
fuzz_file = os.path.join(temp_dir.name, f'fuzz-binaryen-{i}.js')
390+
391+
cmd = [shared.V8, '--wasm-staging', fuzz_file]
392+
proc = subprocess.run(cmd, stdout=subprocess.PIPE)
393+
394+
# An execution is valid if we exited without error, and if we
395+
# managed to run some code before exiting (modules with no
396+
# exports will be considered "invalid" here, but that is very
397+
# rare, and in a sense they are actually not useful).
398+
if proc.returncode == 0 and b'[fuzz-exec] calling ' in proc.stdout:
399+
valid_executions += 1
400+
401+
print('Valid executions are distributed as ~ mean 0.99')
402+
print(f'mean valid executions: {valid_executions / N}')
403+
# Assert on having at least half execute properly. Given the true mean
404+
# is ~0.99, for half of 100 to fail is incredibly unlikely.
405+
self.assertGreater(valid_executions, N / 2)
406+
407+
print()
408+
349409
# "zzz" in test name so that this runs last. If it runs first, it can be
350410
# confusing as it appears next to the logging of which bundle we use (see
351411
# setUpClass).

0 commit comments

Comments
 (0)