Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Propagate context through TCP packets #1161

Merged
merged 6 commits into from
Sep 12, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions bpf/http_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -175,12 +175,13 @@ const u8 ip4ip6_prefix[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff};

#ifdef BPF_DEBUG
static __always_inline void dbg_print_http_connection_info(connection_info_t *info) {
bpf_dbg_printk("[http] s_h = %llx, s_l = %llx, d_h = %llx, d_l = %llx, s_port=%d, d_port=%d",
bpf_dbg_printk("[conn] s_h = %llx, s_l = %llx, s_port=%d",
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I split this so that we can use it on older kernels where bpf_printk allowed for at most 3 arguments.

*(u64 *)(&info->s_addr),
*(u64 *)(&info->s_addr[8]),
info->s_port);
bpf_dbg_printk("[conn] d_h = %llx, d_l = %llx, d_port=%d",
*(u64 *)(&info->d_addr),
*(u64 *)(&info->d_addr[8]),
info->s_port,
info->d_port);
}
#else
Expand Down
78 changes: 62 additions & 16 deletions bpf/k_tracer.c
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,33 @@ int BPF_KPROBE(kprobe_tcp_connect, struct sock *sk) {

bpf_dbg_printk("=== tcp connect %llx ===", id);

tp_info_pid_t *tp_p = tp_buf();
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Connect runs before the SYN packet is sent. We use this opportunity to setup a trace context information for the connection. We'll later query the trace information in tc_egress, and serialize it on the TCP packet.


if (tp_p) {
tp_p->tp.ts = bpf_ktime_get_ns();
tp_p->tp.flags = 1;
tp_p->valid = 1;
tp_p->pid = 0;
urand_bytes(tp_p->tp.span_id, SPAN_ID_SIZE_BYTES);
tp_info_pid_t *server_tp = find_parent_trace();
if (server_tp && valid_trace(server_tp->tp.trace_id)) {
__builtin_memcpy(tp_p->tp.trace_id, server_tp->tp.trace_id, sizeof(tp_p->tp.trace_id));
__builtin_memcpy(tp_p->tp.parent_id, server_tp->tp.span_id, sizeof(tp_p->tp.parent_id));
} else {
urand_bytes(tp_p->tp.trace_id, TRACE_ID_SIZE_BYTES);
__builtin_memset(tp_p->tp.parent_id, 0, sizeof(tp_p->tp.span_id));
}

connection_info_t conn = {};
parse_sock_info(sk, &conn);
sort_connection_info(&conn);

bpf_dbg_printk("Setting up tp info");
dbg_print_http_connection_info(&conn);

bpf_map_update_elem(&outgoing_trace_map, &conn, tp_p, BPF_ANY);
}

u64 addr = (u64)sk;

sock_args_t args = {};
Expand Down Expand Up @@ -232,7 +259,7 @@ int BPF_KRETPROBE(kretprobe_sys_connect, int fd)
sort_connection_info(&info.p_conn.conn);
info.p_conn.pid = pid_from_pid_tgid(id);
info.orig_dport = orig_dport;

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: trailing whitespace

bpf_map_update_elem(&pid_tid_to_conn, &id, &info, BPF_ANY); // to support SSL
}

Expand Down Expand Up @@ -678,17 +705,28 @@ int app_ingress(struct __sk_buff *skb) {
return 0;
}

unsigned char buf[12];
if (!tcp_syn(&tcp) || tcp_ack(&tcp)) {
return 0;
}

if (tcp_syn(&tcp) && !tcp_ack(&tcp)) {
bpf_skb_load_bytes(skb, tcp.hdr_len, &buf, 4);
if (skb->len - tcp.hdr_len < sizeof(tp_info_pid_t)) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was looking into this, and stumbled upon this check in read_sk_buff()

 if ((skb->len - tcp->hdr_len) < 0) { // less than 0 is a packet we can't parse

both len and hdr_len are unsigned, so I think, to complement this check, we should change read_sk_buff() to

 if ((skb->len - tcp->hdr_len) > skb->len) { // underflow, hdr_len is > len, packet too big

or even better

if (tcp->hdr_len > skb->len) { // package too big

bpf_printk("SYN packet without tp info");
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

these prints will have to be removed, but keeping them for now since this is disabled. bpf_dbg_printk doesnt' work because we need to fix it to not use bpf_get_pid_tid, but bpf_get_task info

return 0;
}

s32 len = skb->len-sizeof(u32);
bpf_printk("SYN packed len = %d, offset = %d, hdr_len %d", skb->len, len, tcp.hdr_len);
tp_info_pid_t tp = {0};

bpf_skb_load_bytes(skb, tcp.hdr_len, &tp, sizeof(tp_info_pid_t));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if it may be a good idea to have tp_info_pid_t store some sort of magic number - it's highly unlikely, but if someone stores the exact same amount of bytes there, we will be fooled. Who knows how many other people are doing crazy things like this?

Something like [0xb3l4][tp_info_pid_t]

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good idea! I initially thought of this and said to myself, so if they do, we'll read something random and it's no worse than generating new ID. However, on a second thought, the value someone else puts in could be not random, but a constant and it will make all of our traceIDs be the same. I took your advice to put an ID on the PID (we don't use this field for TCP and it's later overwritten with -1). So I'll check the PID on ingress and if it doesn't match our magic value we'll not take in the packet.


s32 len = skb->len-sizeof(u32);
bpf_printk("Received SYN packed len = %d, offset = %d, hdr_len %d", skb->len, len, tcp.hdr_len);

unsigned char tp_buf[TP_MAX_VAL_LENGTH];
make_tp_string(tp_buf, &tp.tp);
bpf_printk("tp: %s", tp_buf);

bpf_map_update_elem(&incoming_trace_map, &conn, &tp, BPF_ANY);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Once we receive a traceID over the wire (TCP packet) we store it for later to be used by the trace code.


bpf_printk("***Data: %x%x", buf[3], buf[2]);
bpf_printk("***Data: %x%x", buf[1], buf[0]);
}
return 0;
}

Expand All @@ -703,17 +741,25 @@ int app_egress(struct __sk_buff *skb) {
return 0;
}

if (!tcp_syn(&tcp) || tcp_ack(&tcp)) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd add a comment such as:

// handle SYN only, ignore SYN+ACK packets

return 0;
}

sort_connection_info(&conn);

if (tcp_syn(&tcp) && !tcp_ack(&tcp)) {
tp_info_pid_t *tp = bpf_map_lookup_elem(&outgoing_trace_map, &conn);

if (tp) {
bpf_printk("SYN packed len = %d", skb->len);

u32 val=0xdeadf00d;
unsigned char tp_buf[TP_MAX_VAL_LENGTH];
make_tp_string(tp_buf, &tp->tp);
bpf_printk("tp: %s", tp_buf);

uint16_t pkt_end = skb->data_end - skb->data;
bpf_printk("Changing tail and setting data on syn, end=%d", pkt_end);
bpf_skb_change_tail(skb, pkt_end + sizeof(val), 0);
bpf_skb_store_bytes(skb, pkt_end, &val, sizeof(val), 0);
bpf_skb_change_tail(skb, pkt_end + sizeof(tp_info_pid_t), 0);
bpf_skb_store_bytes(skb, pkt_end, tp, sizeof(tp_info_pid_t), 0);

u32 offset_ip_tot_len = 0;
u32 offset_ip_checksum = 0;
Expand All @@ -724,10 +770,10 @@ int app_egress(struct __sk_buff *skb) {
offset_ip_tot_len = ETH_HLEN + offsetof(struct ipv6hdr, payload_len);
}

u16 new_tot_len = bpf_htons(bpf_ntohs(tcp.tot_len) + sizeof(val));
u16 new_tot_len = bpf_htons(bpf_ntohs(tcp.tot_len) + sizeof(tp_info_pid_t));

bpf_printk("tot_len = %d, tot_len_alt = %d, new_tot_len = %d", tcp.tot_len, bpf_ntohs(tcp.tot_len), new_tot_len);
bpf_printk("new_tot_len_alt = %d, h_proto = %d, skb->len = %d", bpf_ntohs(new_tot_len), tcp.h_proto, skb->len);
bpf_printk("tot_len = %d, new_tot_len = %d", bpf_ntohs(tcp.tot_len), bpf_ntohs(new_tot_len));
bpf_printk("h_proto = %d, skb->len = %d", tcp.h_proto, skb->len);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
bpf_printk("tot_len = %d, new_tot_len = %d", bpf_ntohs(tcp.tot_len), bpf_ntohs(new_tot_len));
bpf_printk("h_proto = %d, skb->len = %d", tcp.h_proto, skb->len);
bpf_printk("tot_len = %u, new_tot_len = %u", bpf_ntohs(tcp.tot_len), bpf_ntohs(new_tot_len));
bpf_printk("h_proto = %u, skb->len = %u", tcp.h_proto, skb->len);


if (offset_ip_checksum) {
bpf_l3_csum_replace(skb, offset_ip_checksum, tcp.tot_len, new_tot_len, sizeof(u16));
Expand Down
2 changes: 1 addition & 1 deletion bpf/protocol_http.h
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@ int protocol_http(void *ctx) {
if (meta->type == EVENT_HTTP_CLIENT && !valid_span(tp_p->tp.parent_id)) {
bpf_dbg_printk("Looking for trace id of a client span");
tp_info_pid_t *server_tp = find_parent_trace();
if (server_tp && server_tp->valid) {
if (server_tp && server_tp->valid && valid_trace(server_tp->tp.trace_id)) {
bpf_dbg_printk("Found existing server span for id=%llx", bpf_get_current_pid_tgid());
__builtin_memcpy(info->tp.trace_id, server_tp->tp.trace_id, sizeof(info->tp.trace_id));
__builtin_memcpy(info->tp.parent_id, server_tp->tp.span_id, sizeof(info->tp.parent_id));
Expand Down
2 changes: 1 addition & 1 deletion bpf/protocol_tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ static __always_inline void handle_unknown_tcp_connection(pid_connection_info_t

tp_info_pid_t *server_tp = find_parent_trace();

if (server_tp && server_tp->valid) {
if (server_tp && server_tp->valid && valid_trace(server_tp->tp.trace_id)) {
bpf_dbg_printk("Found existing server tp for client call");
__builtin_memcpy(req->tp.trace_id, server_tp->tp.trace_id, sizeof(req->tp.trace_id));
__builtin_memcpy(req->tp.parent_id, server_tp->tp.span_id, sizeof(req->tp.parent_id));
Expand Down
38 changes: 28 additions & 10 deletions bpf/trace_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,14 @@ static __always_inline void delete_server_trace(trace_key_t *t_key) {
// bpf_dbg_printk("Deleting server span for id=%llx, pid=%d, ns=%d, res = %d", bpf_get_current_pid_tgid(), t_key->p_key.pid, t_key->p_key.ns, res);
}

static __always_inline u8 valid_span(unsigned char *span_id) {
return *((u64 *)span_id) != 0;
}

static __always_inline u8 valid_trace(unsigned char *trace_id) {
return *((u64 *)trace_id) != 0 && *((u64 *)(trace_id + 8)) != 0;
}

static __always_inline void server_or_client_trace(http_connection_metadata_t *meta, connection_info_t *conn, tp_info_pid_t *tp_p) {
if (!meta) {
return;
Expand Down Expand Up @@ -196,10 +204,16 @@ static __always_inline void get_or_create_trace_info(http_connection_metadata_t

if (meta) {
if (meta->type == EVENT_HTTP_CLIENT) {
tp_info_pid_t *in_tp = bpf_map_lookup_elem(&outgoing_trace_map, conn);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Before this change the client code only looked for a server trace and if it doesn't find it it will generate the trace information later. Now we look if the TCP egress has setup trace info for us, if we find it we mark as having trace info, we must not regenerate it. If we have our own TCP egress info, but there's server incoming request that wrapped this client call, then we'll overwrite this info.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd actually put that in a comment

if (in_tp) {
found_tp = 1;
tp_p = in_tp;
}

tp_p->pid = -1; // we only want to prevent correlation of duplicate server calls by PID
tp_info_pid_t *server_tp = find_parent_trace();

if (server_tp && server_tp->valid) {
if (server_tp && server_tp->valid && valid_trace(server_tp->tp.trace_id)) {
found_tp = 1;
bpf_dbg_printk("Found existing server tp for client call");
__builtin_memcpy(tp_p->tp.trace_id, server_tp->tp.trace_id, sizeof(tp_p->tp.trace_id));
Expand All @@ -209,14 +223,22 @@ static __always_inline void get_or_create_trace_info(http_connection_metadata_t
//bpf_dbg_printk("Looking up existing trace for connection");
//dbg_print_http_connection_info(conn);

tp_info_pid_t *existing_tp = trace_info_for_connection(conn);

if (correlated_requests(tp_p, existing_tp)) {
tp_info_pid_t *existing_tp = bpf_map_lookup_elem(&incoming_trace_map, conn);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For server requests, we first look for TCP info and then we fall back to black-box info.

if (existing_tp) {
found_tp = 1;
bpf_dbg_printk("Found existing correlated tp for server request");
bpf_dbg_printk("Found incoming (TCP) tp for server request");
__builtin_memcpy(tp_p->tp.trace_id, existing_tp->tp.trace_id, sizeof(tp_p->tp.trace_id));
__builtin_memcpy(tp_p->tp.parent_id, existing_tp->tp.span_id, sizeof(tp_p->tp.parent_id));
}
} else {
existing_tp = trace_info_for_connection(conn);

if (correlated_requests(tp_p, existing_tp)) {
found_tp = 1;
bpf_dbg_printk("Found existing correlated tp for server request");
__builtin_memcpy(tp_p->tp.trace_id, existing_tp->tp.trace_id, sizeof(tp_p->tp.trace_id));
__builtin_memcpy(tp_p->tp.parent_id, existing_tp->tp.span_id, sizeof(tp_p->tp.parent_id));
}
}
}
}

Expand Down Expand Up @@ -276,8 +298,4 @@ static __always_inline void get_or_create_trace_info(http_connection_metadata_t
return;
}

static __always_inline u8 valid_span(unsigned char *span_id) {
return *((u64 *)span_id) != 0;
}

#endif
16 changes: 15 additions & 1 deletion bpf/tracing.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,20 @@ struct {
__uint(pinning, LIBBPF_PIN_BY_NAME);
} trace_map SEC(".maps");

struct {
__uint(type, BPF_MAP_TYPE_LRU_HASH);
__type(key, connection_info_t); // key: the connection info
__type(value, tp_info_pid_t); // value: traceparent info
__uint(max_entries, MAX_CONCURRENT_REQUESTS);
} incoming_trace_map SEC(".maps");

struct {
__uint(type, BPF_MAP_TYPE_LRU_HASH);
__type(key, connection_info_t); // key: the connection info
__type(value, tp_info_pid_t); // value: traceparent info
__uint(max_entries, MAX_CONCURRENT_REQUESTS);
} outgoing_trace_map SEC(".maps");

static __always_inline void make_tp_string(unsigned char *buf, tp_info_t *tp) {
// Version
*buf++ = '0'; *buf++ = '0'; *buf++ = '-';
Expand All @@ -34,7 +48,7 @@ static __always_inline void make_tp_string(unsigned char *buf, tp_info_t *tp) {
}

static __always_inline tp_info_pid_t *trace_info_for_connection(connection_info_t *conn) {
return (tp_info_pid_t *)bpf_map_lookup_elem(&trace_map, conn);
return (tp_info_pid_t *)bpf_map_lookup_elem(&trace_map, conn);
}

static __always_inline u64 current_epoch(u64 ts) {
Expand Down
6 changes: 6 additions & 0 deletions pkg/internal/ebpf/gotracer/bpf_arm64_bpfel.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Binary file modified pkg/internal/ebpf/gotracer/bpf_arm64_bpfel.o
Binary file not shown.
6 changes: 6 additions & 0 deletions pkg/internal/ebpf/gotracer/bpf_debug_arm64_bpfel.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Binary file modified pkg/internal/ebpf/gotracer/bpf_debug_arm64_bpfel.o
Binary file not shown.
Loading
Loading