        struct mt76_sw_queue *sq = &dev->q_tx[qid];
        struct mt76_queue *q = sq->q;
        struct mt76_queue_entry entry;
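+       /* Per-queue completion counts, accumulated without q->lock in the
+        * cleanup loop below and applied under the lock in one pass.
+        */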
+       unsigned int n_swq_queued[4] = {};
+       unsigned int n_queued = 0;
        bool wake = false;
-       int last;
+       int i, last;
 
        if (!q)
                return;
 
-       spin_lock_bh(&q->lock);
        if (flush)
                last = -1;
        else
                last = readl(&q->regs->dma_idx);
 
-       while (q->queued && q->tail != last) {
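+       /* Reap completed entries without holding q->lock; progress is
+        * tracked in the local n_queued count instead of q->queued.
+        */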
+       while ((q->queued > n_queued) && q->tail != last) {
                mt76_dma_tx_cleanup_idx(dev, q, q->tail, &entry);
                if (entry.schedule)
-                       dev->q_tx[entry.qid].swq_queued--;
+                       n_swq_queued[entry.qid]++;
 
                q->tail = (q->tail + 1) % q->ndesc;
-               q->queued--;
+               n_queued++;
 
-               if (entry.skb) {
-                       spin_unlock_bh(&q->lock);
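+               /* q->lock is not held here anymore, so the completion
+                * callback no longer needs an unlock/relock around it.
+                */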
+               if (entry.skb)
                        dev->drv->tx_complete_skb(dev, qid, &entry);
-                       spin_lock_bh(&q->lock);
-               }
 
                if (entry.txwi) {
                        mt76_put_txwi(dev, entry.txwi);
                        wake = !flush;
                }
 
                if (!flush && q->tail == last)
                        last = readl(&q->regs->dma_idx);
        }
 
+       spin_lock_bh(&q->lock);
+
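+       /* Fold the locally accumulated counts into q->queued and the
+        * per-software-queue swq_queued counters in one locked pass.
+        */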
+       q->queued -= n_queued;
+       for (i = 0; i < ARRAY_SIZE(n_swq_queued); i++) {
+               if (!n_swq_queued[i])
+                       continue;
+
+               dev->q_tx[i].swq_queued -= n_swq_queued[i];
+       }
+
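+       /* A flush drains the ring completely; resync the hardware and
+        * software ring indices afterwards.
+        */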
        if (flush)
                mt76_dma_sync_idx(dev, q);