Skip to content

Commit a10aae8

Browse files
authored
[offload] Remove unnecessary extra allocations in kernel replay tool (#193108)
The tool made two extra heap allocations that held copies of the recorded device memory and the globals. Apparently, the AMDGPU plugin was previously unable to transfer data to the device directly from a memory-mapped file, so these intermediate buffers were added as a work-around. After testing on MI300A and MI250X, the issue no longer reproduces, so we are removing the extra buffers for now.
1 parent 6c35bdb commit a10aae8

File tree

1 file changed

+9
-23
lines changed

1 file changed

+9
-23
lines changed

offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp

Lines changed: 9 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -240,15 +240,8 @@ Error replayKernel() {
240240
return createErr("failed to read the globals file");
241241
auto GlobalsBuffer = std::move(GlobalsBufferOrErr.get());
242242

243-
// On AMD for currently unknown reasons we cannot copy memory mapped data to
244-
// device. This is a work-around.
245-
uint8_t *RecordedGlobals = new uint8_t[GlobalsBuffer->getBufferSize()];
246-
std::memcpy(RecordedGlobals,
247-
const_cast<char *>(GlobalsBuffer->getBuffer().data()),
248-
GlobalsBuffer->getBufferSize());
249-
250-
void *BufferPtr = (void *)RecordedGlobals;
251-
uint32_t NumGlobals = *((uint32_t *)(BufferPtr));
243+
const void *BufferPtr = const_cast<char *>(GlobalsBuffer->getBufferStart());
244+
uint32_t NumGlobals = *((const uint32_t *)(BufferPtr));
252245
BufferPtr = utils::advancePtr(BufferPtr, sizeof(uint32_t));
253246

254247
SmallVector<llvm::offloading::EntryTy> OffloadEntries(
@@ -268,14 +261,15 @@ Error replayKernel() {
268261
Global.Address = static_cast<char *>(OffloadEntries[0].Address) + I + 1;
269262

270263
// Setup the offload entry using the information from the file.
271-
uint32_t NameSize = *((uint32_t *)(BufferPtr));
264+
uint32_t NameSize = *((const uint32_t *)(BufferPtr));
272265
BufferPtr = utils::advancePtr(BufferPtr, sizeof(uint32_t));
273-
uint64_t Size = *((uint64_t *)(BufferPtr));
266+
uint64_t Size = *((const uint64_t *)(BufferPtr));
274267
BufferPtr = utils::advancePtr(BufferPtr, sizeof(uint64_t));
275268
Global.Size = Size;
276-
Global.SymbolName = (char *)BufferPtr;
269+
Global.SymbolName =
270+
const_cast<char *>(static_cast<const char *>(BufferPtr));
277271
BufferPtr = utils::advancePtr(BufferPtr, NameSize);
278-
Global.AuxAddr = BufferPtr;
272+
Global.AuxAddr = const_cast<void *>(BufferPtr);
279273
BufferPtr = utils::advancePtr(BufferPtr, Size);
280274
}
281275

@@ -320,25 +314,17 @@ Error replayKernel() {
320314
return createErr("failed to read the kernel record input file");
321315
auto RecordInputBuffer = std::move(RecordInputBufferOrErr.get());
322316

323-
// On AMD for currently unknown reasons we cannot copy memory mapped data to
324-
// device. This is a work-around.
325-
uint8_t *RecordedData = new uint8_t[RecordInputBuffer->getBufferSize()];
326-
std::memcpy(RecordedData,
327-
const_cast<char *>(RecordInputBuffer->getBuffer().data()),
328-
RecordInputBuffer->getBufferSize());
329-
330317
KernelReplayOutcomeTy Outcome;
331318
Rc = __tgt_target_kernel_replay(
332319
/*Loc=*/nullptr, DeviceId, OffloadEntries[0].Address,
333-
(char *)RecordedData, RecordInputBuffer->getBufferSize(),
320+
const_cast<char *>(RecordInputBuffer->getBufferStart()),
321+
RecordInputBuffer->getBufferSize(),
334322
NumGlobals ? &OffloadEntries[1] : nullptr, NumGlobals, TgtArgs.data(),
335323
TgtArgOffsets.data(), NumArgs, NumTeams, NumThreads, SharedMemorySize,
336324
LoopTripCount, &Outcome);
337325
if (Rc != OMP_TGT_SUCCESS)
338326
return createErr("failed to replay kernel");
339327

340-
delete[] RecordedData;
341-
342328
// Verify the replay output if requested.
343329
if (VerifyOpt) {
344330
if (Outcome.OutputFilepath.empty())

0 commit comments

Comments
 (0)