Skip to content

Commit 73971d7

Browse files
authored
[Fuzzing] Emit secondary wasm files in ClusterFuzz testcases (#7122)
The two files are then linked and run by fuzz_shell.js (we had this functionality already in order to fuzz wasm-split). By adding multiple build and run commands of both the primary and secondary wasm files, we can end up with multiple instances of two different wasm files that call between themselves. To help testing, add a script that extracts the wasm files from the testcase. This may also be useful in the future for testcase reduction.
1 parent 4ffe272 commit 73971d7

3 files changed

Lines changed: 158 additions & 27 deletions

File tree

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
#
2+
# Copyright 2024 WebAssembly Community Group participants
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
'''
Wasm extractor for testcases generated by the ClusterFuzz run.py script. Usage:

extract_wasms.py INFILE.js OUTFILE

That will find embedded wasm files in INFILE.js, of the form

  var .. = new Uint8Array([..wasm_contents..]);

and extract them into OUTFILE.0.wasm, OUTFILE.1.wasm, etc. It also emits
OUTFILE.js which will no longer contain the embedded contents, after which the
script can be run as

  d8 OUTFILE.js -- OUTFILE.0.wasm

That is, the embedded file can now be provided as a filename argument.
'''
33+
34+
import re
35+
import sys
36+
37+
# Number of wasm files named so far; also the index of the next file.
file_counter = 0


def get_wasm_filename():
    '''
    Return the filename for the next extracted wasm file.

    Names have the form {out}.N.wasm, where `out` is the module-level output
    prefix and N starts at 0 and advances by one on every call.
    '''
    global file_counter
    # Build the name from the current index, then advance the counter so the
    # next call gets a fresh name.
    filename = f'{out}.{file_counter}.wasm'
    file_counter += 1
    return filename
44+
45+
46+
# Command line: extract_wasms.py INFILE.js OUTFILE
in_js = sys.argv[1]
# Prefix shared by every output file (OUTFILE.N.wasm and OUTFILE.js).
out = sys.argv[2]

# Read the whole testcase into memory; the embedded wasm arrays are stripped
# out of this text before it is written back.
with open(in_js) as f:
    js = f.read()
51+
52+
53+
def repl(text):
    '''
    Substitution callback for re.sub: write out one embedded wasm binary.

    Despite its name, `text` is the match object for a statement of the form

      var binary = new Uint8Array([..binary data as numbers..]);

    whose first group holds the comma-separated byte values. Those bytes are
    written to the next numbered wasm file, and the empty string is returned
    so that the whole statement is removed from the JS.
    '''
    # Parse out the numbers into a binary wasm file.
    numbers = [int(n) for n in text.group(1).split(',')]
    with open(get_wasm_filename(), 'wb') as f:
        f.write(bytes(numbers))

    # Replace it with nothing.
    return ''
67+
68+
69+
# Replace the wasm files and write them out. Each match is handed to repl(),
# which saves the bytes to disk and removes the declaration from the JS.
js = re.sub(r'var \w+ = new Uint8Array\(\[([\d,]+)\]\);', repl, js)

# Write out the new JS, which no longer embeds any wasm contents.
with open(f'{out}.js', 'w') as f:
    f.write(js)

scripts/clusterfuzz/run.py

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,18 @@ def get_js_file_contents(i, output_dir):
150150
# Prepend the wasm contents, so they are used (rather than the normal
151151
# mechanism where the wasm file's name is provided in argv).
152152
wasm_contents = get_wasm_contents(i, output_dir)
153-
js = f'var binary = {wasm_contents};\n\n' + js
153+
pre = f'var binary = {wasm_contents};\n'
154+
bytes = wasm_contents.count(',')
155+
156+
# Sometimes add a second wasm file as well.
157+
has_second = False
158+
if system_random.random() < 0.333:
159+
has_second = True
160+
wasm_contents = get_wasm_contents(i, output_dir)
161+
pre += f'var secondBinary = {wasm_contents};\n'
162+
bytes += wasm_contents.count(',')
163+
164+
js = pre + '\n' + js
154165

155166
# The default JS builds and runs the wasm. Append some random additional
156167
# operations as well, as more compiles and executions can find things. To
@@ -171,16 +182,23 @@ def get_js_file_contents(i, output_dir):
171182
x = math.pow(x, power)
172183
num = math.floor(x * MAX_EXTRA_JS_OPERATIONS)
173184
assert num >= 0 and num <= MAX_EXTRA_JS_OPERATIONS
185+
186+
extra_js_operations = [
187+
# Compile and link the wasm again. Each link adds more to the total
188+
# exports that we can call.
189+
'build(binary);\n',
190+
# Run all the exports we've accumulated.
191+
'callExports();\n',
192+
]
193+
if has_second:
194+
extra_js_operations += [
195+
'build(secondBinary);\n',
196+
]
197+
174198
for i in range(num):
175-
js += system_random.choice([
176-
# Compile and link the wasm again. Each link adds more to the total
177-
# exports that we can call.
178-
'build(binary);\n',
179-
# Run all the exports we've accumulated.
180-
'callExports();\n',
181-
])
182-
183-
print(f'Created {wasm_contents.count(",")} wasm bytes')
199+
js += system_random.choice(extra_js_operations)
200+
201+
print(f'Created {bytes} wasm bytes')
184202

185203
return js
186204

test/unit/test_cluster_fuzz.py

Lines changed: 56 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import glob
12
import os
23
import platform
34
import re
@@ -159,6 +160,9 @@ def test_file_contents(self):
159160
seen_sizes = []
160161
seen_exports = []
161162

163+
# Second wasm files are also emitted sometimes.
164+
seen_second_sizes = []
165+
162166
# The number of struct.news appears in the metrics report like this:
163167
#
164168
# StructNew : 18
@@ -179,23 +183,16 @@ def test_file_contents(self):
179183
with open(flags_file) as f:
180184
self.assertEqual(f.read(), '--wasm-staging')
181185

182-
# The fuzz files begin with
183-
#
184-
# var binary = new Uint8Array([..binary data as numbers..]);
185-
#
186-
with open(fuzz_file) as f:
187-
first_line = f.readline().strip()
188-
start = 'var binary = new Uint8Array(['
189-
end = ']);'
190-
self.assertTrue(first_line.startswith(start))
191-
self.assertTrue(first_line.endswith(end))
192-
numbers = first_line[len(start):-len(end)]
193-
194-
# Convert to binary, and see that it is a valid file.
195-
numbers_array = [int(x) for x in numbers.split(',')]
196-
binary_file = os.path.join(temp_dir.name, 'file.wasm')
197-
with open(binary_file, 'wb') as f:
198-
f.write(bytes(numbers_array))
186+
# Extract the wasm file(s) from the JS. Make sure to not notice
187+
# stale files.
188+
for f in glob.glob('extracted*'):
189+
os.unlink(f)
190+
extractor = shared.in_binaryen('scripts', 'clusterfuzz', 'extract_wasms.py')
191+
subprocess.check_call([sys.executable, extractor, fuzz_file, 'extracted'])
192+
193+
# One wasm file must always exist, and must be valid.
194+
binary_file = 'extracted.0.wasm'
195+
assert os.path.exists(binary_file)
199196
metrics = subprocess.check_output(
200197
shared.WASM_OPT + ['-all', '--metrics', binary_file, '-q'], text=True)
201198

@@ -215,6 +212,19 @@ def test_file_contents(self):
215212
self.assertEqual(len(exports), 1)
216213
seen_exports.append(int(exports[0]))
217214

215+
# Sometimes a second wasm file should exist, and it must be valid
216+
# too.
217+
second_binary_file = 'extracted.1.wasm'
218+
if os.path.exists(second_binary_file):
219+
subprocess.check_call(
220+
shared.WASM_OPT + ['-all', second_binary_file, '-q'])
221+
222+
# Note its size (we leave detailed metrics for the first one;
223+
# they are generated by the same logic in run.py, so just
224+
# verifying some valid second wasms are emitted, of random
225+
# sizes, is enough).
226+
seen_second_sizes.append(os.path.getsize(second_binary_file))
227+
218228
print()
219229

220230
print('struct.news are distributed as ~ mean 15, stddev 24, median 10')
@@ -247,17 +257,35 @@ def test_file_contents(self):
247257

248258
print()
249259

260+
# Second files appear in ~ 1/3 of testcases.
261+
print('number of second wasms should be around 33 +- 8')
262+
print(f'number of second wasms: {len(seen_second_sizes)}')
263+
assert seen_second_sizes, 'must see at least one second wasm'
264+
print('second sizes are distributed as ~ mean 2933, stddev 2011, median 2510')
265+
print(f'mean sizes: {statistics.mean(seen_second_sizes)}')
266+
print(f'stdev sizes: {statistics.stdev(seen_second_sizes)}')
267+
print(f'median sizes: {statistics.median(seen_second_sizes)}')
268+
# Relax the assert on the max seen second size compared to the max seen
269+
# primary size, as we see fewer of these. 500 is still proof of an
270+
# interesting wasm file.
271+
self.assertGreaterEqual(max(seen_second_sizes), 500)
272+
self.assertGreater(statistics.stdev(seen_second_sizes), 0)
273+
274+
print()
275+
250276
# To check for interesting JS file contents, we'll note how many times
251277
# we build and run the wasm.
252278
seen_builds = []
253279
seen_calls = []
280+
seen_second_builds = []
254281

255282
for i in range(1, N + 1):
256283
fuzz_file = os.path.join(temp_dir.name, f'fuzz-binaryen-{i}.js')
257284
with open(fuzz_file) as f:
258285
js = f.read()
259286
seen_builds.append(js.count('build(binary);'))
260287
seen_calls.append(js.count('callExports();'))
288+
seen_second_builds.append(js.count('build(secondBinary);'))
261289

262290
# There is always one build and one call (those are in the default
263291
# fuzz_shell.js), and we add a couple of operations, each with equal
@@ -284,6 +312,17 @@ def test_file_contents(self):
284312

285313
print()
286314

315+
# Second wasm files are more rarely added, only 1/3 of the time or so,
316+
# but over 100 samples we are still overwhelmingly likely to see one.
317+
print('JS second builds are distributed as ~ mean 1.8, stddev 2.2, median 1')
318+
print(f'mean JS second builds: {statistics.mean(seen_second_builds)}')
319+
print(f'stdev JS second builds: {statistics.stdev(seen_second_builds)}')
320+
print(f'median JS second builds: {statistics.median(seen_second_builds)}')
321+
self.assertGreaterEqual(max(seen_second_builds), 2)
322+
self.assertGreater(statistics.stdev(seen_second_builds), 0)
323+
324+
print()
325+
287326
# "zzz" in test name so that this runs last. If it runs first, it can be
288327
# confusing as it appears next to the logging of which bundle we use (see
289328
# setUpClass).

0 commit comments

Comments
 (0)