Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Example / proof of concept to achieve a combination of head-based sampling + a basic form of tail-based sampling at a span level. #4206

Merged
merged 16 commits into from
Mar 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions OpenTelemetry.sln
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "getting-started-console", "
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "getting-started-jaeger", "docs\trace\getting-started-jaeger\getting-started-jaeger.csproj", "{A0C0B77C-6C7B-4EC2-AC61-EA1F489811B9}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "tail-based-sampling-example", "docs\trace\tail-based-sampling-span-level\tail-based-sampling-example.csproj", "{800DB925-6014-4136-AC01-3356CF7CADD3}"
kalyanaj marked this conversation as resolved.
Show resolved Hide resolved
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "stratified-sampling-example", "docs\trace\stratified-sampling-example\stratified-sampling-example.csproj", "{9C99621C-343E-479C-A943-332DB6129B71}"
EndProject
Global
Expand Down Expand Up @@ -525,6 +527,10 @@ Global
{A0C0B77C-6C7B-4EC2-AC61-EA1F489811B9}.Debug|Any CPU.Build.0 = Debug|Any CPU
{A0C0B77C-6C7B-4EC2-AC61-EA1F489811B9}.Release|Any CPU.ActiveCfg = Release|Any CPU
{A0C0B77C-6C7B-4EC2-AC61-EA1F489811B9}.Release|Any CPU.Build.0 = Release|Any CPU
{800DB925-6014-4136-AC01-3356CF7CADD3}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{800DB925-6014-4136-AC01-3356CF7CADD3}.Debug|Any CPU.Build.0 = Debug|Any CPU
{800DB925-6014-4136-AC01-3356CF7CADD3}.Release|Any CPU.ActiveCfg = Release|Any CPU
{800DB925-6014-4136-AC01-3356CF7CADD3}.Release|Any CPU.Build.0 = Release|Any CPU
{9C99621C-343E-479C-A943-332DB6129B71}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{9C99621C-343E-479C-A943-332DB6129B71}.Debug|Any CPU.Build.0 = Debug|Any CPU
{9C99621C-343E-479C-A943-332DB6129B71}.Release|Any CPU.ActiveCfg = Release|Any CPU
Expand Down Expand Up @@ -568,6 +574,7 @@ Global
{DEDE8442-03CA-48CF-99B9-EA224D89D148} = {5B7FB835-3FFF-4BC2-99C5-A5B5FAE3C818}
{EF4F6280-14D1-49D4-8095-1AC36E169AA8} = {5B7FB835-3FFF-4BC2-99C5-A5B5FAE3C818}
{A0C0B77C-6C7B-4EC2-AC61-EA1F489811B9} = {5B7FB835-3FFF-4BC2-99C5-A5B5FAE3C818}
{800DB925-6014-4136-AC01-3356CF7CADD3} = {5B7FB835-3FFF-4BC2-99C5-A5B5FAE3C818}
{9C99621C-343E-479C-A943-332DB6129B71} = {5B7FB835-3FFF-4BC2-99C5-A5B5FAE3C818}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// <copyright file="ParentBasedElseAlwaysRecordSampler.cs" company="OpenTelemetry Authors">
// Copyright The OpenTelemetry Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// </copyright>

using OpenTelemetry.Trace;

namespace SDKBasedSpanLevelTailSamplingSample;

/// <summary>
/// Proof-of-concept composite sampler (not intended for direct production use)
/// that layers SDK-side, span-level tail sampling on top of parent-based head sampling.
/// The parent-based sampler is consulted first. Any decision other than
/// <see cref="SamplingDecision.Drop"/> is returned unchanged. A drop decision is
/// replaced with <see cref="SamplingDecision.RecordOnly"/> so that a span processor
/// can revisit the decision once end-of-span information (e.g., failure) is available.
/// </summary>
internal class ParentBasedElseAlwaysRecordSampler : Sampler
{
    private const double DefaultSamplingProbabilityForRootSpan = 0.1;
    private readonly ParentBasedSampler parentBasedSampler;

    /// <summary>
    /// Initializes a new instance of the <see cref="ParentBasedElseAlwaysRecordSampler"/> class.
    /// </summary>
    /// <param name="samplingProbabilityForRootSpan">
    /// Probability handed to the <see cref="TraceIdRatioBasedSampler"/> that serves as
    /// the root sampler of the inner parent-based sampler.
    /// </param>
    public ParentBasedElseAlwaysRecordSampler(double samplingProbabilityForRootSpan = DefaultSamplingProbabilityForRootSpan)
    {
        var rootSampler = new TraceIdRatioBasedSampler(samplingProbabilityForRootSpan);
        this.parentBasedSampler = new ParentBasedSampler(rootSampler);
    }

    /// <inheritdoc/>
    public override SamplingResult ShouldSample(in SamplingParameters samplingParameters)
    {
        // Head sampling: ask the parent-based sampler first.
        var headResult = this.parentBasedSampler.ShouldSample(samplingParameters);

        // A head-level drop is softened to RecordOnly so that the processors
        // further down the pipeline can apply tail-based rules (e.g., keep
        // every failed span). RecordOnly matters for two reasons:
        //   1. The processor pipeline is still invoked for the span.
        //   2. activity.IsAllDataRequested returns true, so most
        //      instrumentations populate the attributes those rules need.
        // Every other head decision is passed through untouched.
        return headResult.Decision == SamplingDecision.Drop
            ? new SamplingResult(SamplingDecision.RecordOnly)
            : headResult;
    }
}
59 changes: 59 additions & 0 deletions docs/trace/tail-based-sampling-span-level/Program.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
// <copyright file="Program.cs" company="OpenTelemetry Authors">
// Copyright The OpenTelemetry Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// </copyright>

using System.Diagnostics;
using OpenTelemetry;
using OpenTelemetry.Trace;

namespace SDKBasedSpanLevelTailSamplingSample;

/// <summary>
/// Console demo that wires up the parent-based/record-only sampler together with
/// the tail sampling processor and then emits a mix of successful and failed activities.
/// </summary>
internal class Program
{
    // Single source of truth for the ActivitySource name: the source is created
    // with it and the tracer provider subscribes with it, so the two cannot drift apart.
    private const string ActivitySourceName = "SDK.TailSampling.POC";

    // Number of demo activities to generate.
    private const int ActivityCount = 50;

    // An activity is marked as failed when random.Next(FailureOdds) yields 0.
    private const int FailureOdds = 5;

    // Fixed seed so the success/failure pattern is repeatable between runs.
    private const int RandomSeed = 2357;

    private static readonly ActivitySource MyActivitySource = new(ActivitySourceName);

    /// <summary>
    /// Builds the tracer provider and generates the sample activities.
    /// </summary>
    /// <param name="args">Command line arguments (unused).</param>
    public static void Main(string[] args)
    {
        using var tracerProvider = Sdk.CreateTracerProviderBuilder()
            .SetSampler(new ParentBasedElseAlwaysRecordSampler())
            .AddSource(ActivitySourceName)
            .AddProcessor(new TailSamplingProcessor())
            .AddConsoleExporter()
            .Build();

        var random = new Random(RandomSeed);

        // Generate some spans
        for (var i = 0; i < ActivityCount; i++)
        {
            // Disposed (and therefore ended) at the end of each loop iteration.
            using var activity = MyActivitySource.StartActivity("SayHello");

            activity?.SetTag("foo", "bar");

            // Simulate a mix of failed and successful spans
            var status = random.Next(FailureOdds) == 0
                ? ActivityStatusCode.Error
                : ActivityStatusCode.Ok;

            activity?.SetStatus(status);
        }
    }
}
108 changes: 108 additions & 0 deletions docs/trace/tail-based-sampling-span-level/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# Tail Based Sampling at an activity (span) level: An Example

This document describes one possible way to achieve a form of tail-based
sampling to include all failed activities in addition to head-based sampling.

It does this by leveraging the extensibility mechanisms in the OpenTelemetry
SDK. It uses a combination of a custom sampler and an ActivityProcessor
(span processor).

This is a way to achieve a combination of:

- Head-based sampling (probabilistic/unbiased sampling), and
- Tail-based sampling (a non-probabilistic/biased sampling).

## How does this sampling example work?

We use a hybrid approach: we do head based sampling to get a
probabilistic subset of all activities which includes both successful activities
and failure activities. In addition, we want to capture all failure activities.
To do this, if the parent based sampler's decision is to drop it, we return
a "Record-Only" sampling result. This ensures that the activity processor
receives that activity. In the activity processor, at the end of an activity,
we check if it is a failure activity. If so, we change the decision from
"Record-Only" to set the sampled flag so that the exporter receives the
activity. In this example, each activity is filtered individually, without
considering any other activities.

This is a basic form of tail-based sampling at an activity level. If an
activity failed, we always sample it in addition to all head-sampled
activities.

## When should you consider such an option?

This is a good option if you want to get all failure activities in addition to
head-based sampling. With this, you get basic activity-level tail-based sampling
at an SDK level without having to install any additional components.

## Tradeoffs

Tail-sampling this way involves many tradeoffs such as:

1. Additional performance cost: Unlike head-based sampling where the sampling
decision is made at activity creation time, in tail sampling the decision is made
only at the end, so there is additional memory/processing cost.

2. Partial traces: Since this sampling is at an activity level, the generated trace
will be partial. For example, if another part of the call tree is successful,
those activities may not be exported, leading to an incomplete trace.

3. If multiple exporters are used, this decision will impact all of them:
[Issue 3861](https://github.com/open-telemetry/opentelemetry-dotnet/issues/3861).

## Sample Output

You should see output such as the below when you run this example.

```text
Including error activity with id
00-404ddff248b8f9a9b21e347d68d2640e-035858bc3c168885-01 and status Error
Activity.TraceId: 404ddff248b8f9a9b21e347d68d2640e
Activity.SpanId: 035858bc3c168885
Activity.TraceFlags: Recorded
Activity.ActivitySourceName: SDK.TailSampling.POC
Activity.DisplayName: SayHello
Activity.Kind: Internal
Activity.StartTime: 2023-02-09T19:05:32.5563112Z
Activity.Duration: 00:00:00.0028144
Activity.Tags:
foo: bar
StatusCode: Error
Resource associated with Activity:
service.name: unknown_service:Examples.TailBasedSamplingAtSpanLevel

Dropping activity with id 00-ea861bda268c58d328ab7cbe49851499-daba29055de80a53-00
and status Ok

Including error activity with id
00-802dea991247e2d699d943167eb546de-cc120b0bd1741b52-01 and status Error
Activity.TraceId: 802dea991247e2d699d943167eb546de
Activity.SpanId: cc120b0bd1741b52
Activity.TraceFlags: Recorded
Activity.ActivitySourceName: SDK.TailSampling.POC
Activity.DisplayName: SayHello
Activity.Kind: Internal
Activity.StartTime: 2023-02-09T19:05:32.7021138Z
Activity.Duration: 00:00:00.0000012
Activity.Tags:
foo: bar
StatusCode: Error
Resource associated with Activity:
service.name: unknown_service:Examples.TailBasedSamplingAtSpanLevel

Including head-sampled activity with id
00-f3c88010615e285c8f3cb3e2bcd70c7f-f9316215f12437c3-01 and status Ok
Activity.TraceId: f3c88010615e285c8f3cb3e2bcd70c7f
Activity.SpanId: f9316215f12437c3
Activity.TraceFlags: Recorded
Activity.ActivitySourceName: SDK.TailSampling.POC
Activity.DisplayName: SayHello
Activity.Kind: Internal
Activity.StartTime: 2023-02-09T19:05:32.8519346Z
Activity.Duration: 00:00:00.0000034
Activity.Tags:
foo: bar
StatusCode: Ok
Resource associated with Activity:
service.name: unknown_service:Examples.TailBasedSamplingAtSpanLevel
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
// <copyright file="TailSamplingProcessor.cs" company="OpenTelemetry Authors">
// Copyright The OpenTelemetry Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// </copyright>

using System.Diagnostics;
using OpenTelemetry;

namespace SDKBasedSpanLevelTailSamplingSample;

/// <summary>
/// A custom processor that filters <see cref="Activity"/> instances when they end:
/// head-sampled activities are left alone, and activities that were not head-sampled
/// are marked for export only when they finished with an error status.
/// </summary>
internal sealed class TailSamplingProcessor : BaseProcessor<Activity>
{
    /// <inheritdoc/>
    public override void OnEnd(Activity activity)
    {
        if (!activity.Recorded)
        {
            // Not picked by head-based sampling; give it a second chance
            // based on how the activity ended.
            this.IncludeForExportIfFailedActivity(activity);
        }
        else
        {
            // Already included by head-based sampling; that decision stands
            // and nothing needs to change.
            Console.WriteLine($"Including head-sampled activity with id {activity.Id} and status {activity.Status}");
        }

        base.OnEnd(activity);
    }

    // Span-level tail sampling applied at the end of a span: a failed span is
    // always sampled, in addition to every head-sampled span. Each span is
    // judged on its own, without looking at any other span.
    // Tail sampling this way comes with tradeoffs, for example:
    // 1. Performance: head-based sampling decides at span creation time, whereas
    //    here the decision is only made at the end, so there is additional memory cost.
    // 2. Incomplete traces: because the decision is per span, the resulting trace
    //    is partial. If another part of the call tree succeeded, those spans may
    //    not be sampled in.
    // 3. If multiple exporters are used, this decision affects all of them:
    //    https://github.com/open-telemetry/opentelemetry-dotnet/issues/3861.
    private void IncludeForExportIfFailedActivity(Activity activity)
    {
        if (activity.Status != ActivityStatusCode.Error)
        {
            // This span is not sampled and exporters won't see this span.
            Console.WriteLine($"Dropping activity with id {activity.Id} and status {activity.Status}");
            return;
        }

        // Failure spans are always included: set the recorded flag so that
        // the span gets exported. The flag is set before logging, as in the
        // original statement order.
        activity.ActivityTraceFlags |= ActivityTraceFlags.Recorded;
        Console.WriteLine($"Including error activity with id {activity.Id} and status {activity.Status}");
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<Project Sdk="Microsoft.NET.Sdk">
<!--
  Project file for the span-level tail-based sampling example.
  The only item declared here is a project reference to the console exporter.
  NOTE(review): $(RepoRoot) is not defined in this file, and no TargetFramework or
  OutputType is set here; presumably these are inherited from shared build props
  elsewhere in the repository. Confirm before moving this project.
-->
<ItemGroup>
<ProjectReference Include="$(RepoRoot)\src\OpenTelemetry.Exporter.Console\OpenTelemetry.Exporter.Console.csproj" />
</ItemGroup>
</Project>