Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bluetooth: fix HCI command / data deadlock #52536

Merged
merged 4 commits into from
Dec 15, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
186 changes: 129 additions & 57 deletions subsys/bluetooth/host/conn.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@ LOG_MODULE_REGISTER(bt_conn);

struct tx_meta {
struct bt_conn_tx *tx;
/* This flag indicates if the current buffer has already been partially
* sent to the controller (ie, the next fragments should be sent as
* continuations).
*/
bool is_cont;
};

#define tx_data(buf) ((struct tx_meta *)net_buf_user_data(buf))
Expand Down Expand Up @@ -429,6 +434,8 @@ int bt_conn_send_cb(struct bt_conn *conn, struct net_buf *buf,
tx_data(buf)->tx = NULL;
}

tx_data(buf)->is_cont = false;

net_buf_put(&conn->tx_queue, buf);
return 0;
}
Expand Down Expand Up @@ -497,24 +504,41 @@ static int send_iso(struct bt_conn *conn, struct net_buf *buf, uint8_t flags)
return bt_send(buf);
}

static bool send_frag(struct bt_conn *conn, struct net_buf *buf, uint8_t flags,
bool always_consume)
static inline uint16_t conn_mtu(struct bt_conn *conn)
{
#if defined(CONFIG_BT_BREDR)
if (conn->type == BT_CONN_TYPE_BR || !bt_dev.le.acl_mtu) {
return bt_dev.br.mtu;
}
#endif /* CONFIG_BT_BREDR */
#if defined(CONFIG_BT_ISO)
if (conn->type == BT_CONN_TYPE_ISO && bt_dev.le.iso_mtu) {
return bt_dev.le.iso_mtu;
}
#endif /* CONFIG_BT_ISO */
#if defined(CONFIG_BT_CONN)
return bt_dev.le.acl_mtu;
#else
return 0;
#endif /* CONFIG_BT_CONN */
}

static int do_send_frag(struct bt_conn *conn, struct net_buf *buf, uint8_t flags)
{
struct bt_conn_tx *tx = tx_data(buf)->tx;
uint32_t *pending_no_cb;
unsigned int key;
int err = 0;

LOG_DBG("conn %p buf %p len %u flags 0x%02x", conn, buf, buf->len, flags);

/* Wait until the controller can accept ACL packets */
k_sem_take(bt_conn_get_pkts(conn), K_FOREVER);

/* Check for disconnection while waiting for pkts_sem */
if (conn->state != BT_CONN_CONNECTED) {
err = -ENOTCONN;
goto fail;
}

LOG_DBG("conn %p buf %p len %u flags 0x%02x", conn, buf, buf->len,
flags);

/* Add to pending, it must be done before bt_buf_set_type */
key = irq_lock();
if (tx) {
Expand Down Expand Up @@ -552,12 +576,21 @@ static bool send_frag(struct bt_conn *conn, struct net_buf *buf, uint8_t flags,
(*pending_no_cb)--;
}
irq_unlock(key);

/* We don't want to end up in a situation where send_acl/iso
* returns the same error code as when we don't get a buffer in
* time.
*/
err = -EIO;
goto fail;
}

return true;
return 0;

fail:
/* If we get here, something has seriously gone wrong:
* We also need to destroy the `parent` buf.
*/
k_sem_give(bt_conn_get_pkts(conn));
if (tx) {
/* `buf` might not get destroyed, and its `tx` pointer will still be reachable.
Expand All @@ -567,35 +600,41 @@ static bool send_frag(struct bt_conn *conn, struct net_buf *buf, uint8_t flags,
conn_tx_destroy(conn, tx);
}

if (always_consume) {
net_buf_unref(buf);
}
return false;
return err;
}

static inline uint16_t conn_mtu(struct bt_conn *conn)
static int send_frag(struct bt_conn *conn,
struct net_buf *buf, struct net_buf *frag,
uint8_t flags)
{
#if defined(CONFIG_BT_BREDR)
if (conn->type == BT_CONN_TYPE_BR || !bt_dev.le.acl_mtu) {
return bt_dev.br.mtu;
/* Check if the controller can accept ACL packets */
if (k_sem_take(bt_conn_get_pkts(conn), K_NO_WAIT)) {
LOG_DBG("no controller bufs");
return -ENOBUFS;
}
#endif /* CONFIG_BT_BREDR */
#if defined(CONFIG_BT_ISO)
if (conn->type == BT_CONN_TYPE_ISO && bt_dev.le.iso_mtu) {
return bt_dev.le.iso_mtu;

/* Add the data to the buffer */
if (frag) {
Thalley marked this conversation as resolved.
Show resolved Hide resolved
uint16_t frag_len = MIN(conn_mtu(conn), net_buf_tailroom(frag));

net_buf_add_mem(frag, buf->data, frag_len);
net_buf_pull(buf, frag_len);
} else {
/* De-queue the buffer now that we know we can send it.
* Only applies if the buffer to be sent is the original buffer,
* and not one of its fragments.
* This buffer was fetched from the FIFO using a peek operation.
*/
buf = net_buf_get(&conn->tx_queue, K_NO_WAIT);
frag = buf;
}
#endif /* CONFIG_BT_ISO */
#if defined(CONFIG_BT_CONN)
return bt_dev.le.acl_mtu;
#else
return 0;
#endif /* CONFIG_BT_CONN */

return do_send_frag(conn, frag, flags);
}

static struct net_buf *create_frag(struct bt_conn *conn, struct net_buf *buf)
{
struct net_buf *frag;
uint16_t frag_len;

switch (conn->type) {
#if defined(CONFIG_BT_ISO)
Expand All @@ -619,52 +658,55 @@ static struct net_buf *create_frag(struct bt_conn *conn, struct net_buf *buf)

/* Fragments never have a TX completion callback */
tx_data(frag)->tx = NULL;

frag_len = MIN(conn_mtu(conn), net_buf_tailroom(frag));

net_buf_add_mem(frag, buf->data, frag_len);
net_buf_pull(buf, frag_len);
tx_data(frag)->is_cont = false;

return frag;
}

static bool send_buf(struct bt_conn *conn, struct net_buf *buf)
static int send_buf(struct bt_conn *conn, struct net_buf *buf)
{
struct net_buf *frag;
uint8_t flags;
int err;

LOG_DBG("conn %p buf %p len %u", conn, buf, buf->len);

/* Send directly if the packet fits the ACL MTU */
if (buf->len <= conn_mtu(conn)) {
return send_frag(conn, buf, FRAG_SINGLE, false);
}

/* Create & enqueue first fragment */
frag = create_frag(conn, buf);
if (!frag) {
return false;
}

if (!send_frag(conn, frag, FRAG_START, true)) {
return false;
if (buf->len <= conn_mtu(conn) && !tx_data(buf)->is_cont) {
LOG_DBG("send single");
return send_frag(conn, buf, NULL, FRAG_SINGLE);
}

LOG_DBG("start fragmenting");
/*
* Send the fragments. For the last one simply use the original
* buffer (which works since we've used net_buf_pull on it.
* buffer (which works since we've used net_buf_pull on it).
*/
flags = FRAG_START;
if (tx_data(buf)->is_cont) {
flags = FRAG_CONT;
}

while (buf->len > conn_mtu(conn)) {
frag = create_frag(conn, buf);
if (!frag) {
return false;
return -ENOMEM;
}

if (!send_frag(conn, frag, FRAG_CONT, true)) {
return false;
err = send_frag(conn, buf, frag, flags);
if (err) {
LOG_DBG("%p failed, mark as existing frag", buf);
tx_data(buf)->is_cont = flags != FRAG_START;
net_buf_unref(frag);
return err;
}

flags = FRAG_CONT;
}

return send_frag(conn, buf, FRAG_END, false);
LOG_DBG("last frag");
tx_data(buf)->is_cont = true;
return send_frag(conn, buf, NULL, FRAG_END);
}

static struct k_poll_signal conn_change =
Expand Down Expand Up @@ -732,10 +774,26 @@ static int conn_prepare_events(struct bt_conn *conn,

LOG_DBG("Adding conn %p to poll list", conn);

k_poll_event_init(&events[0],
K_POLL_TYPE_FIFO_DATA_AVAILABLE,
K_POLL_MODE_NOTIFY_ONLY,
&conn->tx_queue);
bool buffers_available = k_sem_count_get(bt_conn_get_pkts(conn)) > 0;
bool packets_waiting = !k_fifo_is_empty(&conn->tx_queue);

if (packets_waiting && !buffers_available) {
/* Only resume sending when the controller has buffer space
* available for this connection.
*/
LOG_DBG("wait on ctlr buffers");
k_poll_event_init(&events[0],
K_POLL_TYPE_SEM_AVAILABLE,
K_POLL_MODE_NOTIFY_ONLY,
bt_conn_get_pkts(conn));
} else {
/* Wait until there is more data to send. */
LOG_DBG("wait on host fifo");
k_poll_event_init(&events[0],
K_POLL_TYPE_FIFO_DATA_AVAILABLE,
K_POLL_MODE_NOTIFY_ONLY,
&conn->tx_queue);
}
events[0].tag = BT_EVENT_CONN_TX_QUEUE;

return 0;
Expand Down Expand Up @@ -779,6 +837,7 @@ int bt_conn_prepare_events(struct k_poll_event events[])
void bt_conn_process_tx(struct bt_conn *conn)
{
struct net_buf *buf;
int err;

LOG_DBG("conn %p", conn);

Expand All @@ -789,10 +848,23 @@ void bt_conn_process_tx(struct bt_conn *conn)
return;
}

/* Get next ACL packet for connection */
buf = net_buf_get(&conn->tx_queue, K_NO_WAIT);
/* Get next ACL packet for connection. The buffer will only get dequeued
* if there is a free controller buffer to put it in.
*
* Important: no operations should be done on `buf` until it is properly
* dequeued from the FIFO, using the `net_buf_get()` API.
*/
buf = k_fifo_peek_head(&conn->tx_queue);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem with this is that you're skipping the fragment handling that net_buf_get() takes care of. We don't (AFAIK) currently take advantage of the net_buf fragment feature in the Bluetooth subsystem, but at least we've used the proper net_buf APIs until now that should keep the code working if we did use fragments.

At the very least, in addition to asserting that buf != NULL, you should also BT_ASSERT((buf->flags & NET_BUF_FRAGS) == 0);. Though even that's pretty unclean, since these are really net_buf-internal features.

A cleaner way would be to introduce a new net_buf_peek() API that does the same as net_buf_get() but also fetches any pending fragments.

Yet another, perhaps more long term solution, would be to finally move the net_buf fragments list into it's own independent list that's not mangled together with the linked list pointer as part of the same union (which is the reason for the NET_BUF_FRAGS flag and why fragments are placed as independent objects into the FIFO). Currently the net_buf definition starts off like this:

struct net_buf {
	union {
		/** Allow placing the buffer into sys_slist_t */
		sys_snode_t node;

		/** Fragments associated with this buffer. */
		struct net_buf *frags;
	};

The penalty of moving frags out from the union would be that the struct size increases by the size of a pointer, but the benefit would be that we don't have this hassle of serializing & deserializing fragments whenever buffers pass through LIFO or FIFO objects.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At the very least, in addition to asserting that buf != NULL, you should also BT_ASSERT((buf->flags & NET_BUF_FRAGS) == 0);. Though even that's pretty unclean, since these are really net_buf-internal features.

Oh, and in addition to the assert, you need to explicitly set buf->frags = NULL since the k_fifo implementation leaves the linked list pointer untouched when removing from the list.

Also see #38830 (review) and related discussion in the PR.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is that really a problem since I'm using the _get before passing it to bt_send? That takes care of the fragments, and the peek is a non-destructive operation afaik.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At the very least you need to clear the buf->frags pointer since k_fifo_peek_head() doesn't do that for you. You have the EIO branch below where e.g. net_buf_unref() is called, and net_buf_unref() will also unref the frags pointer if it is non-NULL. Same goes for any other net_buf API calls you might do on the buffer before calling net_buf_get()

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, for correctness, it would be better to get a proper reference to the buffer when peeking, i.e. call net_buf_ref() on the pointer. Then, when you finally call net_buf_ger() the code would look something like the following (i.e.no need for the dequeued variable):

net_buf_unref(buf);
buf = net_buf_get(...);

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, and in addition to the assert, you need to explicitly set buf->frags = NULL since the k_fifo implementation leaves the linked list pointer untouched when removing from the list.

I just tried and it didn't work. I think it's because buf->frags is in a union with the node ptr.
So if I overwrite it with NULL while it is still on the queue, and it ends up that I don't pull it from the fifo (e.g. the sem_take fails), the fifo list is effectively broken.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, good point. And this is another reason why using peak() together with the union is bad - you end up with the buffer in an inconsistent state until you actually remove it from from the FIFO.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The short term solution would then probably be to document this clearly and try to make the code flow path from peek() to get() as short and obvious as possible.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yep agreed. I will place a comment above the peek saying this.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so I reverted the assert since #52760 got merged.

BT_ASSERT(buf);
if (!send_buf(conn, buf)) {

/* Since we used `peek`, the queue still owns the reference to the
* buffer, so we need to take an explicit additional reference here.
*/
buf = net_buf_ref(buf);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe add a code comment above this, something along the lines of "Since we used peek(), the queue still owns the reference to the buffer, so we need to take an explicit additional reference here.

err = send_buf(conn, buf);
net_buf_unref(buf);

if (err == -EIO) {
struct bt_conn_tx *tx = tx_data(buf)->tx;

tx_data(buf)->tx = NULL;
Expand Down
8 changes: 8 additions & 0 deletions subsys/bluetooth/host/hci_core.c
Original file line number Diff line number Diff line change
Expand Up @@ -2465,6 +2465,13 @@ static void process_events(struct k_poll_event *ev, int count)
switch (ev->state) {
case K_POLL_STATE_SIGNALED:
break;
case K_POLL_STATE_SEM_AVAILABLE:
/* After this fn is exec'd, `bt_conn_prepare_events()`
* will be called once again, and this time buffers will
* be available, so the FIFO will be added to the poll
* list instead of the ctlr buffers semaphore.
*/
break;
case K_POLL_STATE_FIFO_DATA_AVAILABLE:
if (ev->tag == BT_EVENT_CMD_TX) {
send_cmd();
Expand Down Expand Up @@ -2524,6 +2531,7 @@ static void hci_tx_thread(void *p1, void *p2, void *p3)
events[0].state = K_POLL_STATE_NOT_READY;
ev_count = 1;

/* This adds the FIFO per-connection */
if (IS_ENABLED(CONFIG_BT_CONN) || IS_ENABLED(CONFIG_BT_ISO)) {
ev_count += bt_conn_prepare_events(&events[1]);
}
Expand Down
15 changes: 10 additions & 5 deletions tests/bluetooth/bsim_bt/bsim_test_l2cap_stress/prj.conf
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,14 @@ CONFIG_BT_GAP_AUTO_UPDATE_CONN_PARAMS=n
# L2CAP MPS
# 23+27+27=77 makes exactly three full packets
CONFIG_BT_L2CAP_TX_MTU=77
CONFIG_BT_BUF_ACL_TX_SIZE=77

# Use this to send L2CAP PDUs without any fragmentation.
# In this particular case, we prefer fragmenting to test that code path.
# CONFIG_BT_BUF_ACL_TX_SIZE=81
Thalley marked this conversation as resolved.
Show resolved Hide resolved

# L2CAP PDUs will be fragmented in 3 ACL packets.
CONFIG_BT_BUF_ACL_TX_SIZE=27

CONFIG_BT_BUF_ACL_TX_COUNT=4

# The minimum value for this is
Expand All @@ -31,14 +38,12 @@ CONFIG_BT_BUF_ACL_RX_SIZE=81
# to keep it that way as to stress the stack as much as possible.
CONFIG_BT_L2CAP_TX_BUF_COUNT=6

CONFIG_BT_CTLR_DATA_LENGTH_MAX=27
CONFIG_BT_CTLR_RX_BUFFERS=10
# The ring buffer now has space for three times as much data
# (default 27, 3*27=81), so that it does not run out of data
# while waiting for new SDUs to be queued.
CONFIG_BT_CTLR_DATA_LENGTH_MAX=81

CONFIG_BT_MAX_CONN=10

CONFIG_LOG=y
CONFIG_ASSERT=y
CONFIG_BT_DEBUG_LOG=y
CONFIG_NET_BUF_POOL_USAGE=y
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ extern enum bst_result_t bst_result;
}


#define WAIT_SECONDS 220 /* seconds */
#define WAIT_SECONDS 270 /* seconds */
#define WAIT_TIME (WAIT_SECONDS * USEC_PER_SEC) /* microseconds*/

#define FAIL(...) \
Expand Down
Loading