Skip to content
Open
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
<InternalsVisibleTo Include="dotnet-monitor" />
<InternalsVisibleTo Include="dotnet-trace" />
<InternalsVisibleTo Include="dotnet-dump" />
<InternalsVisibleTo Include="dotnet-stack" />
<InternalsVisibleTo Include="Microsoft.Diagnostics.Monitoring" />
<InternalsVisibleTo Include="Microsoft.Diagnostics.Monitoring.EventPipe" />
<InternalsVisibleTo Include="Microsoft.Diagnostics.WebSocketServer" />
Expand Down
219 changes: 143 additions & 76 deletions src/Tools/dotnet-stack/ReportCommand.cs
Original file line number Diff line number Diff line change
Expand Up @@ -27,22 +27,44 @@ internal static class ReportCommandHandler
/// <param name="processId">The process to report the stack from.</param>
/// <param name="name">The name of process to report the stack from.</param>
/// <param name="duration">The duration of to trace the target for. </param>
/// <param name="diagnosticPort">The diagnostic port to connect to.</param>
/// <returns></returns>
private static async Task<int> Report(CancellationToken ct, TextWriter stdOutput, TextWriter stdError, int processId, string name, TimeSpan duration)
private static async Task<int> Report(CancellationToken ct, TextWriter stdOutput, TextWriter stdError, int processId, string name, TimeSpan duration, string diagnosticPort)
{
string tempNetTraceFilename = Path.Join(Path.GetTempPath(), Path.GetRandomFileName() + ".nettrace");
string tempEtlxFilename = "";

try
{
// Either processName or processId has to be specified.
// Validate that only one of processId, name, or diagnosticPort is specified
int optionCount = 0;
if (processId != 0)
{
optionCount++;
}
if (!string.IsNullOrEmpty(name))
{
optionCount++;
}
if (!string.IsNullOrEmpty(diagnosticPort))
{
optionCount++;
}

if (optionCount == 0)
{
stdError.WriteLine("--process-id, --name, or --diagnostic-port is required");
return -1;
}
else if (optionCount > 1)
{
stdError.WriteLine("Only one of --process-id, --name, or --diagnostic-port can be specified");
return -1;
}

// Resolve process name to ID if needed
if (!string.IsNullOrEmpty(name))
{
if (processId != 0)
{
Console.WriteLine("Can only specify either --name or --process-id option.");
return -1;
}
processId = CommandUtils.FindProcessIdWithName(name);
if (processId < 0)
{
Expand All @@ -55,91 +77,125 @@ private static async Task<int> Report(CancellationToken ct, TextWriter stdOutput
stdError.WriteLine("Process ID should not be negative.");
return -1;
}
else if (processId == 0)

DiagnosticsClientBuilder builder = new("dotnet-stack", 10);
using (DiagnosticsClientHolder holder = await builder.Build(ct, processId, diagnosticPort, showChildIO: false, printLaunchCommand: false).ConfigureAwait(false))
{
stdError.WriteLine("--process-id is required");
return -1;
}
if (holder == null)
{
return -1;
}

DiagnosticsClient client = holder.Client;

DiagnosticsClient client = new(processId);
List<EventPipeProvider> providers = new()
{
new EventPipeProvider("Microsoft-DotNETCore-SampleProfiler", EventLevel.Informational)
};

// collect a *short* trace with stack samples
// the hidden '--duration' flag can increase the time of this trace in case 10ms
// is too short in a given environment, e.g., resource constrained systems
// N.B. - This trace INCLUDES rundown. For sufficiently large applications, it may take non-trivial time to collect
// the symbol data in rundown.
EventPipeSession session = await client.StartEventPipeSessionAsync(providers, requestRundown:true, token:ct).ConfigureAwait(false);
using (session)
using (FileStream fs = File.OpenWrite(tempNetTraceFilename))
{
Task copyTask = session.EventStream.CopyToAsync(fs, ct);
await Task.Delay(duration, ct).ConfigureAwait(false);
await session.StopAsync(ct).ConfigureAwait(false);

// check if rundown is taking more than 5 seconds and add comment to report
Task timeoutTask = Task.Delay(TimeSpan.FromSeconds(5));
Task completedTask = await Task.WhenAny(copyTask, timeoutTask).ConfigureAwait(false);
if (completedTask == timeoutTask)
// Resume runtime if it was suspended (similar to --resume-runtime:true in other tools)
// This is safe to call even if the runtime wasn't suspended - it's a no-op in that case
try
{
stdOutput.WriteLine($"# Sufficiently large applications can cause this reportCommand to take non-trivial amounts of time");
await client.ResumeRuntimeAsync(ct).ConfigureAwait(false);
}
catch (Exception)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Its a little confusing that if it we deemed it safe to call ResumeRuntimeAsync, that we would expect exceptions. If the runtime was suspended and we hit an exception, we are now ignoring it, is that right?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed, we probably want to follow the precedent established by other tools:
dotnet-trace

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@copilot - please follow the Exception handling pattern used by dotnet-trace for this call as shown here: https://github.com/dotnet/diagnostics/blob/main/src/Tools/dotnet-trace/CommandLine/Commands/CollectCommand.cs#L341-L348

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done! Updated the exception handling to follow the dotnet-trace pattern. Now only catching UnsupportedCommandException for the ResumeRuntimeAsync call, and removed the overly broad try-catch blocks that were hiding diagnostic information. Exceptions now bubble up with full stack traces. Commit f3a8c2d.

{
// ResumeRuntime is a no-op if the runtime wasn't suspended,
// so we can safely ignore exceptions here
}
await copyTask.ConfigureAwait(false);
}

// using the generated trace file, symbolicate and compute stacks.
tempEtlxFilename = TraceLog.CreateFromEventPipeDataFile(tempNetTraceFilename);
using (SymbolReader symbolReader = new(TextWriter.Null) { SymbolPath = SymbolPath.MicrosoftSymbolServerPath })
using (TraceLog eventLog = new(tempEtlxFilename))
{
MutableTraceEventStackSource stackSource = new(eventLog)
List<EventPipeProvider> providers = new()
{
OnlyManagedCodeStacks = true
new EventPipeProvider("Microsoft-DotNETCore-SampleProfiler", EventLevel.Informational)
};

SampleProfilerThreadTimeComputer computer = new(eventLog, symbolReader);
computer.GenerateThreadTimeStacks(stackSource);

Dictionary<int, List<StackSourceSample>> samplesForThread = new();
// collect a *short* trace with stack samples
// the hidden '--duration' flag can increase the time of this trace in case 10ms
// is too short in a given environment, e.g., resource constrained systems
// N.B. - This trace INCLUDES rundown. For sufficiently large applications, it may take non-trivial time to collect
// the symbol data in rundown.
EventPipeSession session;
try
{
session = await client.StartEventPipeSessionAsync(providers, requestRundown:true, token:ct).ConfigureAwait(false);
}
catch (Exception)
{
stdError.WriteLine(EventPipeErrorMessage);
return -1;
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are now overriding all exceptions and their stacktraces with a blanket exception message, is that better?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's an improvement over current state. I don't think 'read past the end of the stream' gives us enough to tell what was wrong.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this particular API throw the EndOfStreamException? I wouldn't have expected it to. This API starts the session but the only thing it reads is the immediate reply from the runtime acknowledging the session start request.

I think there are a couple potential issues here:

  1. If StartEventPipeSessionAsync throws EndOfStreamException that sounds like a bug in StartEventPipeSessionAsync we should fix. We expect those APIs to throw something derived from DiagnosticsClientException.
  2. If there are bugs in DiagnosticClient (or elsewhere) we'd like to get as much info as possible to help us solve them when users report the problem back to us.

I think the ideal fix here is to determine the root cause of the EndOfStreamException assuming we can repro it. If that isn't possible then we probably want to do something that both helps users get a vague idea what went wrong while still including the exact Exception stack trace + Exception message to help us diagnose the bug when it occurs in the wild.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree, we should hunt down what triggers the error and either make a more graceful handling of it and/or report the error better. Handling all exceptions the same way is probably to limiting and might hide valueable details if we hit cases when don't have good error handling for. dotnet-stack runs sample profiler for 10 ms and then run logic to analyze the samples, so maybe there is errors in its logic in case no data is coming back from sample profiler that could be detected differently. It also waits 5 seconds for rundown to complete, not sure how that works when the runtime is suspended.


stackSource.ForEach((sample) => {
StackSourceCallStackIndex stackIndex = sample.StackIndex;
while (!stackSource.GetFrameName(stackSource.GetFrameIndex(stackIndex), false).StartsWith("Thread ("))
try
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not quite following, what was the rationale for adding try/catch for these three blocks (this plus the two above). Also couldn't this and the above be merged under the same try/catch, they seem like the same format

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is broad catch handler here that serves as our fallback for unanticipated errors. Inside that scope we generally only catch narrower Exception types that we know how to handle or how to describe to the user in a meaningful way.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@copilot, please remove this try catch handler and let Exceptions flow into the outer catch handler.

{
using (session)
using (FileStream fs = File.OpenWrite(tempNetTraceFilename))
{
stackIndex = stackSource.GetCallerIndex(stackIndex);
Task copyTask = session.EventStream.CopyToAsync(fs, ct);
await Task.Delay(duration, ct).ConfigureAwait(false);
await session.StopAsync(ct).ConfigureAwait(false);

// check if rundown is taking more than 5 seconds and add comment to report
Task timeoutTask = Task.Delay(TimeSpan.FromSeconds(5));
Task completedTask = await Task.WhenAny(copyTask, timeoutTask).ConfigureAwait(false);
if (completedTask == timeoutTask)
{
stdOutput.WriteLine($"# Sufficiently large applications can cause this reportCommand to take non-trivial amounts of time");
}
await copyTask.ConfigureAwait(false);
}
}
catch (Exception)
{
stdError.WriteLine(EventPipeErrorMessage);
return -1;
}

// long form for: int.Parse(threadFrame["Thread (".Length..^1)])
// Thread id is in the frame name as "Thread (<ID>)"
string template = "Thread (";
string threadFrame = stackSource.GetFrameName(stackSource.GetFrameIndex(stackIndex), false);
// using the generated trace file, symbolicate and compute stacks.
tempEtlxFilename = TraceLog.CreateFromEventPipeDataFile(tempNetTraceFilename);
using (SymbolReader symbolReader = new(TextWriter.Null) { SymbolPath = SymbolPath.MicrosoftSymbolServerPath })
using (TraceLog eventLog = new(tempEtlxFilename))
{
MutableTraceEventStackSource stackSource = new(eventLog)
{
OnlyManagedCodeStacks = true
};

// we are looking for the first index of ) because
// we need to handle a thread name like: Thread (4008) (.NET IO ThreadPool Worker)
int firstIndex = threadFrame.IndexOf(')');
int threadId = int.Parse(threadFrame.AsSpan(template.Length, firstIndex - template.Length));
SampleProfilerThreadTimeComputer computer = new(eventLog, symbolReader);
computer.GenerateThreadTimeStacks(stackSource);

if (samplesForThread.TryGetValue(threadId, out List<StackSourceSample> samples))
{
samples.Add(sample);
}
else
{
samplesForThread[threadId] = new List<StackSourceSample>() { sample };
}
});
Dictionary<int, List<StackSourceSample>> samplesForThread = new();

// For every thread recorded in our trace, print the first stack
foreach ((int threadId, List<StackSourceSample> samples) in samplesForThread)
{
stackSource.ForEach((sample) => {
StackSourceCallStackIndex stackIndex = sample.StackIndex;
while (!stackSource.GetFrameName(stackSource.GetFrameIndex(stackIndex), false).StartsWith("Thread ("))
{
stackIndex = stackSource.GetCallerIndex(stackIndex);
}

// long form for: int.Parse(threadFrame["Thread (".Length..^1)])
// Thread id is in the frame name as "Thread (<ID>)"
string template = "Thread (";
string threadFrame = stackSource.GetFrameName(stackSource.GetFrameIndex(stackIndex), false);

// we are looking for the first index of ) because
// we need to handle a thread name like: Thread (4008) (.NET IO ThreadPool Worker)
int firstIndex = threadFrame.IndexOf(')');
int threadId = int.Parse(threadFrame.AsSpan(template.Length, firstIndex - template.Length));

if (samplesForThread.TryGetValue(threadId, out List<StackSourceSample> samples))
{
samples.Add(sample);
}
else
{
samplesForThread[threadId] = new List<StackSourceSample>() { sample };
}
});

// For every thread recorded in our trace, print the first stack
foreach ((int threadId, List<StackSourceSample> samples) in samplesForThread)
{
#if DEBUG
stdOutput.WriteLine($"Found {samples.Count} stacks for thread 0x{threadId:X}");
stdOutput.WriteLine($"Found {samples.Count} stacks for thread 0x{threadId:X}");
#endif
PrintStack(stdOutput, threadId, samples[0], stackSource);
PrintStack(stdOutput, threadId, samples[0], stackSource);
}
}
}
}
Expand Down Expand Up @@ -189,15 +245,17 @@ public static Command ReportCommand()
{
ProcessIdOption,
NameOption,
DurationOption
DurationOption,
DiagnosticPortOption
};

reportCommand.SetAction((parseResult, ct) => Report(ct,
stdOutput: parseResult.Configuration.Output,
stdError: parseResult.Configuration.Error,
processId: parseResult.GetValue(ProcessIdOption),
name: parseResult.GetValue(NameOption),
duration: parseResult.GetValue(DurationOption)));
duration: parseResult.GetValue(DurationOption),
diagnosticPort: parseResult.GetValue(DiagnosticPortOption)));

return reportCommand;
}
Expand All @@ -221,5 +279,14 @@ public static Command ReportCommand()
{
Description = "The name of the process to report the stack."
};

private static readonly Option<string> DiagnosticPortOption =
new("--diagnostic-port", "--dport")
{
Description = "The path to a diagnostic port to be used."
};

private const string EventPipeErrorMessage =
"There was a failure in reading stack data. Possible reasons could be trying to connect to a runtime that has been suspended, an unexpected close of the IPC channel, etc.";
}
}
2 changes: 2 additions & 0 deletions src/Tools/dotnet-stack/dotnet-stack.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,10 @@
<ItemGroup>
<Compile Include="..\Common\Commands\ProcessStatus.cs" Link="ProcessStatus.cs" />
<Compile Include="..\Common\Commands\Utils.cs" Link="Utils.cs" />
<Compile Include="..\Common\ReversedServerHelpers\ReversedServerHelpers.cs" Link="ReversedServerHelpers.cs" />
<Compile Include="..\Common\ProcessNativeMethods\ProcessNativeMethods.cs" Link="ProcessNativeMethods.cs" />
<Compile Include="..\Common\WindowsProcessExtension\WindowsProcessExtension.cs" Link="WindowsProcessExtension.cs" />
<Compile Include="..\Common\CommandLineErrorException.cs" Link="CommandLineErrorException.cs" />
<Compile Include="..\Common\DsRouterProcessLauncher.cs" Link="DsRouterProcessLauncher.cs" />
<Compile Include="..\Common\IConsole.cs" Link="IConsole.cs" />
</ItemGroup>
Expand Down
21 changes: 21 additions & 0 deletions src/tests/dotnet-stack/StackTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -100,5 +100,26 @@ public async Task ReportsStacksCorrectly(TestConfiguration config)
Assert.True(correctStackParts[j] == stackParts[i], $"{correctStackParts[j]} != {stackParts[i]}");
}
}

[Fact]
public void DiagnosticPortOptionIsRegistered()
{
Command reportCommand = ReportCommandHandler.ReportCommand();

// Verify the diagnostic port option is registered
bool hasDiagnosticPortOption = false;
foreach (var option in reportCommand.Options)
{
if (option.Name == "diagnostic-port")
{
hasDiagnosticPortOption = true;
// Verify it has the short alias --dport
Assert.Contains("--dport", option.Aliases);
break;
}
}

Assert.True(hasDiagnosticPortOption, "The --diagnostic-port option should be registered in the report command");
}
}
}