Skip to content

Commit

Permalink
Allow ncclDebugLog to print messages longer than 1024 bytes
Browse files (browse the repository at this point in the history)
  • Loading branch information
nzmsv committed Mar 6, 2024
1 parent 48bb7fe commit 0c448a2
Showing 1 changed file with 14 additions and 7 deletions.
21 changes: 14 additions & 7 deletions src/debug.cc
Original file line number | Diff line number | Diff line change
Expand Up @@ -156,6 +156,8 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
va_start(vargs, fmt);
(void) vsnprintf(ncclLastError, sizeof(ncclLastError), fmt, vargs);
va_end(vargs);
// Make sure ncclLastError is always null-terminated
ncclLastError[sizeof(ncclLastError)-1] = '\0';
pthread_mutex_unlock(&ncclDebugLock);
}
if (ncclDebugLevel < level || ((flags & ncclDebugMask) == 0)) return;
Expand Down Expand Up @@ -185,17 +187,22 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] %f %s:%d NCCL TRACE ",
hostname, pid, tid, cudaDev, timestamp, filefunc, line);
}

if (len > sizeof(buffer)) {
len = sizeof(buffer);
}
if (len) {
len += snprintf(buffer+len, sizeof(buffer)-len, "%s\n", fmt);
// vsnprintf may return len > sizeof(buffer) in the case of a truncated output.
// Rewind len so that we can replace the final \0 by \n
if (len > sizeof(buffer)-1) {
len = sizeof(buffer)-1;
}
buffer[len-1] = '\n';
buffer[len] = '\0';
va_list vargs;
va_start(vargs, fmt);
len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
vfprintf(ncclDebugFile, buffer, vargs);
va_end(vargs);
// vsnprintf may return len > sizeof(buffer) in the case of a truncated output.
// Rewind len so that we can replace the final \0 by \n
if (len > sizeof(buffer)) len = sizeof(buffer)-1;
buffer[len++] = '\n';
fwrite(buffer, 1, len, ncclDebugFile);
}
}

Expand Down

0 comments on commit 0c448a2

Please sign in to comment.