Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Example / proof of concept to achieve a combination of head-based sampling + a basic form of tail-based sampling at a span level. #4206

Merged
merged 16 commits into from
Mar 10, 2023
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions OpenTelemetry.sln
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "getting-started-console", "
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "getting-started-jaeger", "docs\trace\getting-started-jaeger\getting-started-jaeger.csproj", "{A0C0B77C-6C7B-4EC2-AC61-EA1F489811B9}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "tail-based-sampling-example", "docs\trace\tail-based-sampling-span-level\tail-based-sampling-example.csproj", "{800DB925-6014-4136-AC01-3356CF7CADD3}"
kalyanaj marked this conversation as resolved.
Show resolved Hide resolved
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "stratified-sampling-example", "docs\trace\advanced\stratified-sampling-example\stratified-sampling-example.csproj", "{9C99621C-343E-479C-A943-332DB6129B71}"
EndProject
Global
Expand Down Expand Up @@ -525,6 +527,10 @@ Global
{A0C0B77C-6C7B-4EC2-AC61-EA1F489811B9}.Debug|Any CPU.Build.0 = Debug|Any CPU
{A0C0B77C-6C7B-4EC2-AC61-EA1F489811B9}.Release|Any CPU.ActiveCfg = Release|Any CPU
{A0C0B77C-6C7B-4EC2-AC61-EA1F489811B9}.Release|Any CPU.Build.0 = Release|Any CPU
{800DB925-6014-4136-AC01-3356CF7CADD3}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{800DB925-6014-4136-AC01-3356CF7CADD3}.Debug|Any CPU.Build.0 = Debug|Any CPU
{800DB925-6014-4136-AC01-3356CF7CADD3}.Release|Any CPU.ActiveCfg = Release|Any CPU
{800DB925-6014-4136-AC01-3356CF7CADD3}.Release|Any CPU.Build.0 = Release|Any CPU
{9C99621C-343E-479C-A943-332DB6129B71}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{9C99621C-343E-479C-A943-332DB6129B71}.Debug|Any CPU.Build.0 = Debug|Any CPU
{9C99621C-343E-479C-A943-332DB6129B71}.Release|Any CPU.ActiveCfg = Release|Any CPU
Expand Down Expand Up @@ -568,6 +574,7 @@ Global
{DEDE8442-03CA-48CF-99B9-EA224D89D148} = {5B7FB835-3FFF-4BC2-99C5-A5B5FAE3C818}
{EF4F6280-14D1-49D4-8095-1AC36E169AA8} = {5B7FB835-3FFF-4BC2-99C5-A5B5FAE3C818}
{A0C0B77C-6C7B-4EC2-AC61-EA1F489811B9} = {5B7FB835-3FFF-4BC2-99C5-A5B5FAE3C818}
{800DB925-6014-4136-AC01-3356CF7CADD3} = {5B7FB835-3FFF-4BC2-99C5-A5B5FAE3C818}
{9C99621C-343E-479C-A943-332DB6129B71} = {5B7FB835-3FFF-4BC2-99C5-A5B5FAE3C818}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// <copyright file="ParentBasedElseAlwaysRecordSampler.cs" company="OpenTelemetry Authors">
// Copyright The OpenTelemetry Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// </copyright>

using OpenTelemetry.Trace;

namespace SDKBasedSpanLevelTailSamplingSample;

/// <summary>
/// Note: This is a proof-of-concept and is not meant to be used directly in production.
/// This is a composite sampler used to achieve a combination of parent-based sampling
/// and SDK-side "span-level" tail-based sampling.
/// It first invokes a head-sampling mechanism using the parent based sampling approach.
/// If the parent based sampler's decision is to sample it (i.e., record and export the span),
/// it retains that decision. If not, it returns a "record-only" sampling result that can be
/// changed later by a span processor based on span attributes (e.g., failure) that become
/// available only by the end of the span.
/// </summary>
internal class ParentBasedElseAlwaysRecordSampler : Sampler
{
    private const double DefaultSamplingProbabilityForRootSpan = 0.1;

    // Head sampler consulted first on every ShouldSample call.
    private readonly ParentBasedSampler headSampler;

    /// <summary>
    /// Initializes a new instance of the <see cref="ParentBasedElseAlwaysRecordSampler"/> class.
    /// </summary>
    /// <param name="samplingProbabilityForRootSpan">
    /// Probability passed to the <see cref="TraceIdRatioBasedSampler"/> that is wrapped
    /// by the parent-based sampler. Defaults to 0.1.
    /// </param>
    public ParentBasedElseAlwaysRecordSampler(double samplingProbabilityForRootSpan = DefaultSamplingProbabilityForRootSpan)
    {
        var rootSampler = new TraceIdRatioBasedSampler(samplingProbabilityForRootSpan);
        this.headSampler = new ParentBasedSampler(rootSampler);
    }

    /// <summary>
    /// Asks the parent-based sampler for a decision and upgrades a "Drop" to "RecordOnly".
    /// </summary>
    /// <param name="samplingParameters">Parameters forwarded unchanged to the parent-based sampler.</param>
    /// <returns>
    /// The parent-based sampler's result when it is not a drop; otherwise a new
    /// result carrying <see cref="SamplingDecision.RecordOnly"/>.
    /// </returns>
    public override SamplingResult ShouldSample(in SamplingParameters samplingParameters)
    {
        var headResult = this.headSampler.ShouldSample(samplingParameters);

        // A head decision other than Drop is kept as-is.
        //
        // A Drop is replaced by RecordOnly so that span filtering processors later
        // in the pipeline can apply tail-based rules (e.g., sample all failed spans).
        // RecordOnly matters here because:
        // 1. It causes the processor pipeline to be invoked.
        // 2. It causes activity.IsAllDataRequested to return true, so most
        //    instrumentations end up populating the required attributes.
        return headResult.Decision == SamplingDecision.Drop
            ? new SamplingResult(SamplingDecision.RecordOnly)
            : headResult;
    }
}
59 changes: 59 additions & 0 deletions docs/trace/tail-based-sampling-span-level/Program.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
// <copyright file="Program.cs" company="OpenTelemetry Authors">
// Copyright The OpenTelemetry Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// </copyright>

using System.Diagnostics;
using OpenTelemetry;
using OpenTelemetry.Trace;

namespace SDKBasedSpanLevelTailSamplingSample;

internal class Program
{
    private static readonly ActivitySource MyActivitySource = new("SDK.TailSampling.POC");

    /// <summary>
    /// Builds a tracer provider wired with the composite sampler, the tail-sampling
    /// processor and the console exporter, then emits 50 "SayHello" activities.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    public static void Main(string[] args)
    {
        using var tracerProvider = Sdk.CreateTracerProviderBuilder()
            .SetSampler(new ParentBasedElseAlwaysRecordSampler())
            .AddSource("SDK.TailSampling.POC")
            .AddProcessor(new TailSamplingProcessor())
            .AddConsoleExporter()
            .Build();

        // Fixed seed so the sequence of Ok/Error statuses is the same on every run.
        var statusPicker = new Random(2357);

        // Generate some spans
        for (var iteration = 0; iteration < 50; iteration++)
        {
            using var activity = MyActivitySource.StartActivity("SayHello");
            activity?.SetTag("foo", "bar");

            // Simulate a mix of failed and successful spans:
            // a draw of 0 (out of 0..4) marks the span as failed.
            var status = statusPicker.Next(5) == 0
                ? ActivityStatusCode.Error
                : ActivityStatusCode.Ok;
            activity?.SetStatus(status);
        }
    }
}
103 changes: 103 additions & 0 deletions docs/trace/tail-based-sampling-span-level/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# Tail Based Sampling at a span level: An Example

This is an example / proof of concept to achieve tail-based sampling at a
kalyanaj marked this conversation as resolved.
Show resolved Hide resolved
span level using the extensibility mechanisms in OpenTelemetry.NET.

This is a way to achieve a combination of:

- Head-based sampling (based on probabilistic sampling) and
cijothomas marked this conversation as resolved.
Show resolved Hide resolved
- Tail-based sampling to get all failure spans (non-probabilistic sampling).

## How does this sampling example work?

We use a hybrid approach: we do head based sampling to get a
probabilistic subset of all spans which includes both successful spans
kalyanaj marked this conversation as resolved.
Show resolved Hide resolved
and failure spans. In addition, we want to capture all failure spans.
To do this, if the parent based sampler's decision is to drop it, we return
a "Record-Only" sampling result. This ensures that the Span processor
receives that span. In the span processor, at the end of a span, we check if
it is a failure span. If so, we change the decision from "Record-Only"
to set the sampled flag so that the exporter receives the span.
In this example, each span is filtered individually without consideration to any
other spans.

This is a basic form of tail-based sampling at a span level. If a span failed,
we always sample it in addition to all head-sampled spans.

## When should you consider such an option?

This is a good option if you want to get all failure spans in addition to
head based sampling. With this, you get basic span level tail-based sampling
at the SDK level without having to install any additional components.

## Tradeoffs

Tail-sampling this way involves many tradeoffs such as:

1. Additional memory cost: Unlike head-based sampling where the sampling
kalyanaj marked this conversation as resolved.
Show resolved Hide resolved
decision is made at span creation time, in tail sampling the decision is made
only at the end of the span, so every span must be recorded until it ends,
which adds memory cost.
kalyanaj marked this conversation as resolved.
Show resolved Hide resolved

2. Partial traces: Since this sampling is at a span level, the generated trace
will be partial. For example, if another part of the call tree is successful,
those spans may not be exported, leading to an incomplete trace.

3. If multiple exporters are used, this decision will impact all of them:
[Issue 3861](https://github.com/open-telemetry/opentelemetry-dotnet/issues/3861).

## Sample Output

You should see output such as the below when you run this example.

```text
Including error span with id
00-404ddff248b8f9a9b21e347d68d2640e-035858bc3c168885-01 and status Error
Activity.TraceId: 404ddff248b8f9a9b21e347d68d2640e
Activity.SpanId: 035858bc3c168885
Activity.TraceFlags: Recorded
Activity.ActivitySourceName: SDK.TailSampling.POC
Activity.DisplayName: SayHello
Activity.Kind: Internal
Activity.StartTime: 2023-02-09T19:05:32.5563112Z
Activity.Duration: 00:00:00.0028144
Activity.Tags:
foo: bar
StatusCode: Error
Resource associated with Activity:
service.name: unknown_service:Examples.TailBasedSamplingAtSpanLevel

Dropping span with id 00-ea861bda268c58d328ab7cbe49851499-daba29055de80a53-00
and status Ok

Including error span with id
00-802dea991247e2d699d943167eb546de-cc120b0bd1741b52-01 and status Error
Activity.TraceId: 802dea991247e2d699d943167eb546de
Activity.SpanId: cc120b0bd1741b52
Activity.TraceFlags: Recorded
Activity.ActivitySourceName: SDK.TailSampling.POC
Activity.DisplayName: SayHello
Activity.Kind: Internal
Activity.StartTime: 2023-02-09T19:05:32.7021138Z
Activity.Duration: 00:00:00.0000012
Activity.Tags:
foo: bar
StatusCode: Error
Resource associated with Activity:
service.name: unknown_service:Examples.TailBasedSamplingAtSpanLevel

Including head-sampled span with id
00-f3c88010615e285c8f3cb3e2bcd70c7f-f9316215f12437c3-01 and status Ok
Activity.TraceId: f3c88010615e285c8f3cb3e2bcd70c7f
Activity.SpanId: f9316215f12437c3
Activity.TraceFlags: Recorded
Activity.ActivitySourceName: SDK.TailSampling.POC
Activity.DisplayName: SayHello
Activity.Kind: Internal
Activity.StartTime: 2023-02-09T19:05:32.8519346Z
Activity.Duration: 00:00:00.0000034
Activity.Tags:
foo: bar
StatusCode: Ok
Resource associated with Activity:
service.name: unknown_service:Examples.TailBasedSamplingAtSpanLevel
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
// <copyright file="TailSamplingProcessor.cs" company="OpenTelemetry Authors">
// Copyright The OpenTelemetry Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// </copyright>

using System.Diagnostics;
using OpenTelemetry;

namespace SDKBasedSpanLevelTailSamplingSample;

/// <summary>
/// A custom processor for filtering <see cref="Activity"/> instances.
/// </summary>
internal sealed class TailSamplingProcessor : BaseProcessor<Activity>
{
    /// <summary>
    /// Initializes a new instance of the <see cref="TailSamplingProcessor"/> class.
    /// </summary>
    public TailSamplingProcessor()
        : base()
    {
    }

    /// <summary>
    /// Applies the span-level filter to the ended activity, then defers to the base implementation.
    /// </summary>
    /// <param name="activity">The activity that has just ended.</param>
    public override void OnEnd(Activity activity)
    {
        this.FilterSpan(activity);
        base.OnEnd(activity);
    }

    /// <summary>
    /// Decides, at the end of a span, whether the span should carry the Recorded trace flag.
    /// </summary>
    /// <remarks>
    /// This is a basic form of tail-based sampling at a span level.
    /// If a span failed, we always sample it in addition to all head-sampled spans.
    /// In this example, each span is filtered individually without consideration to any other spans.
    /// Tail-sampling this way involves many tradeoffs. A few examples of the tradeoffs:
    /// 1. Performance: Unlike head-based sampling where the sampling decision is made at span
    ///    creation time, in tail sampling the decision is made only at the end, so there is
    ///    additional memory cost.
    /// 2. Traces will not be complete: Since this sampling is at a span level, the generated
    ///    trace will be partial. For example, if another part of the call tree is successful,
    ///    those spans may not be sampled in, leading to a partial trace.
    /// 3. If multiple exporters are used, this decision will impact all of them:
    ///    https://github.com/open-telemetry/opentelemetry-dotnet/issues/3861.
    /// </remarks>
    /// <param name="activity">The ended activity to inspect and, for failures, flag as recorded.</param>
    private void FilterSpan(Activity activity)
    {
        if (activity.Recorded)
        {
            // This means that this activity was included based on head-based sampling,
            // we continue with that decision and no further change is needed.
            Console.WriteLine($"Including head-sampled span with id {activity.Id} and status {activity.Status}");
            return;
        }

        if (activity.Status == ActivityStatusCode.Error)
        {
            // We decide to always include all the failure spans.
            // Set the recorded flag so that this will be exported.
            activity.ActivityTraceFlags |= ActivityTraceFlags.Recorded;
            Console.WriteLine($"Including error span with id {activity.Id} and status {activity.Status}");
            return;
        }

        // This span is not sampled and exporters won't see this span.
        Console.WriteLine($"Dropping span with id {activity.Id} and status {activity.Status}");
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<Project Sdk="Microsoft.NET.Sdk">

  <!-- Console executable for the span-level tail-based sampling example. -->
  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
  </PropertyGroup>

  <!-- Builds against the in-repo SDK and console exporter projects. -->
  <ItemGroup>
    <ProjectReference Include="$(RepoRoot)\src\OpenTelemetry\OpenTelemetry.csproj" />
    <ProjectReference Include="$(RepoRoot)\src\OpenTelemetry.Exporter.Console\OpenTelemetry.Exporter.Console.csproj" />
  </ItemGroup>

</Project>