Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
f601c9f
net-tcp_bbr: broaden app-limited rate sample detection
nealcardwell Jun 11, 2019
9cb2d74
net-tcp_bbr: v2: shrink delivered_mstamp, first_tx_mstamp to u32 to f…
nealcardwell Jun 25, 2018
7679309
net-tcp_bbr: v2: snapshot packets in flight at transmit time and pass…
nealcardwell Aug 5, 2017
4e589c6
net-tcp_bbr: v2: count packets lost over TCP rate sampling interval
nealcardwell Oct 13, 2017
ade2a0e
net-tcp_bbr: v2: export FLAG_ECE in rate_sample.is_ece
nealcardwell Nov 19, 2018
1f4015e
net-tcp_bbr: v2: introduce ca_ops->skb_marked_lost() CC module callba…
nealcardwell Aug 8, 2018
f82a3d3
net-tcp_bbr: v2: adjust skb tx.in_flight upon merge in tcp_shifted_skb()
nealcardwell May 2, 2019
bfa26db
net-tcp_bbr: v2: adjust skb tx.in_flight upon split in tcp_fragment()
nealcardwell May 2, 2019
0fa4869
net-tcp: add new ca opts flag TCP_CONG_WANTS_CE_EVENTS
yousukseung May 24, 2018
c20e56d
net-tcp: re-generalize TSO sizing in TCP CC module API
nealcardwell Sep 27, 2019
a5cc006
net-tcp: add fast_ack_mode=1: skip rwin check in tcp_fast_ack_mode__t…
nealcardwell Nov 16, 2019
4fef7ac
net-tcp_bbr: v2: record app-limited status of TLP-repaired flight
jianfenw Jun 19, 2020
dc4a1f8
net-tcp_bbr: v2: inform CC module of losses repaired by TLP probe
jianfenw Jun 16, 2020
9f5cbd8
net-tcp_bbr: v2: introduce is_acking_tlp_retrans_seq into rate_sample
nealcardwell Sep 21, 2020
40f1ce9
tcp: introduce per-route feature RTAX_FEATURE_ECN_LOW
Jul 14, 2023
aa27c22
net-tcp_bbr: v3: update TCP "bbr" congestion control module to BBRv3
nealcardwell Jun 11, 2019
5ad789e
net-tcp_bbr: v3: ensure ECN-enabled BBR flows set ECT on retransmits
Jun 11, 2021
a1d32ad
tcp: export TCPI_OPT_ECN_LOW in tcp_info tcpi_options field
nealcardwell Jul 24, 2023
a7743a2
net-test: add .config for kernel circa v5.10, with many TCP CC module…
soheilhy Jan 24, 2016
04ed1b4
net-test: adds a gce-install.sh script to build and install kernel on…
soheilhy Jan 24, 2016
e7db863
net-test: scripts for testing BBR with upstream Linux kernels
nealcardwell Aug 29, 2018
f60d60e
net-tcp-bbr1: for testing, a copy of BBRv1
nealcardwell Jul 20, 2023
c931462
net-test: udpate config.gce to recent kernel and enable BBR1 for testing
nealcardwell Jul 20, 2023
118c5d9
iproute2: a .patch file for iproute2: ss: output TCP BBRv3 diag infor…
nealcardwell Jul 23, 2023
2dec5d0
iproute2: a .patch file for iproute2: ip: introduce the ecn_low per-r…
nealcardwell Jul 23, 2023
aaf9327
iproute2: a .patch file for iproute2: ss: display "ecn_low" if tcp_in…
nealcardwell Jul 24, 2023
6e321d1
net-tcp_bbr: v3: add a README.md for TCP BBR v3 release
nealcardwell Jul 23, 2019
d257091
Adding EXPORT_SYMBOL macro for tcp_tso_autosize function, for referen…
ashutoshs25 Nov 9, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,9 @@ x509.genkey
/allrandom.config
/allyes.config

# tmp build/install directory for /gce-install.sh:
/gce/

# Kconfig savedefconfig output
/defconfig

Expand Down
165 changes: 165 additions & 0 deletions 0001-ss-output-TCP-BBRv3-diag-information.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
From ca7f11ebc4d4a99ccfd44be8555d505b26996c12 Mon Sep 17 00:00:00 2001
From: Arjun Roy <arjunroy@google.com>
Date: Mon, 25 Jul 2022 12:49:35 -0400
Subject: [PATCH 2/2] ss: output TCP BBRv3 diag information

Add logic for printing diag information for TCP BBRv3 congestion
control. This commit leaves in place the support for printing the
earlier TCP BBRv1 congestion control information.

Both BBRv1 and BBRv3 are using the same enum value. The BBRv3 struct
starts with the same data as BBRv1, so it is backward-compatible
with BBRv1, to allow older ss binaries to print basic information for
BBRv3. We use the size of the returned data and the version field to
check the version of the data.

Signed-off-by: Arjun Roy <arjunroy@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David Morley <morleyd@google.com>
---
include/uapi/linux/inet_diag.h | 23 ++++++++++++
misc/ss.c | 66 +++++++++++++++++++++++++++++++++-
2 files changed, 88 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index d81cb69a26a9..dca833ecb783 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -229,6 +229,29 @@ struct tcp_bbr_info {
__u32 bbr_min_rtt; /* min-filtered RTT in uSec */
__u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */
__u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */
+ __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */
+ __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */
+ __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */
+ __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */
+ __u8 bbr_mode; /* current bbr_mode in state machine */
+ __u8 bbr_phase; /* current state machine phase */
+ __u8 unused1; /* alignment padding; not used yet */
+ __u8 bbr_version; /* BBR algorithm version */
+ __u32 bbr_inflight_lo; /* lower short-term data volume bound */
+ __u32 bbr_inflight_hi; /* higher long-term data volume bound */
+ __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */
+};
+
+/* TCP BBR congestion control bbr_phase as reported in netlink/ss stats. */
+enum tcp_bbr_phase {
+ BBR_PHASE_INVALID = 0,
+ BBR_PHASE_STARTUP = 1,
+ BBR_PHASE_DRAIN = 2,
+ BBR_PHASE_PROBE_RTT = 3,
+ BBR_PHASE_PROBE_BW_UP = 4,
+ BBR_PHASE_PROBE_BW_DOWN = 5,
+ BBR_PHASE_PROBE_BW_CRUISE = 6,
+ BBR_PHASE_PROBE_BW_REFILL = 7,
};

union tcp_cc_info {
diff --git a/misc/ss.c b/misc/ss.c
index e9d813596b91..5f413118f0dd 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -912,6 +912,7 @@ struct tcpstat {
bool app_limited;
struct dctcpstat *dctcp;
struct tcp_bbr_info *bbr_info;
+ unsigned int bbr_info_len;
};

/* SCTP assocs share the same inode number with their parent endpoint. So if we
@@ -2585,6 +2586,29 @@ static void sctp_stats_print(struct sctp_info *s)
out(" fraginl:%d", s->sctpi_s_frag_interleave);
}

+static const char* bbr_phase_to_str(enum tcp_bbr_phase phase)
+{
+ switch (phase) {
+ case BBR_PHASE_STARTUP:
+ return "STARTUP";
+ case BBR_PHASE_DRAIN:
+ return "DRAIN";
+ case BBR_PHASE_PROBE_RTT:
+ return "PROBE_RTT";
+ case BBR_PHASE_PROBE_BW_UP:
+ return "PROBE_BW_UP";
+ case BBR_PHASE_PROBE_BW_DOWN:
+ return "PROBE_BW_DOWN";
+ case BBR_PHASE_PROBE_BW_CRUISE:
+ return "PROBE_BW_CRUISE";
+ case BBR_PHASE_PROBE_BW_REFILL:
+ return "PROBE_BW_REFILL";
+ case BBR_PHASE_INVALID:
+ default:
+ return "INVALID";
+ }
+}
+
static void tcp_stats_print(struct tcpstat *s)
{
char b1[64];
@@ -2658,7 +2682,14 @@ static void tcp_stats_print(struct tcpstat *s)
}

if (s->bbr_info) {
- __u64 bw;
+ /* All versions of the BBR algorithm use the INET_DIAG_BBRINFO
+ * enum value. Later versions of the tcp_bbr_info struct are
+ * backward-compatible with earlier versions, to allow older ss
+ * binaries to print basic information for newer versions of
+ * the algorithm. We use the size of the returned tcp_bbr_info
+ * struct to decide how much to print.
+ */
+ __u64 bw, bw_hi, bw_lo;

bw = s->bbr_info->bbr_bw_hi;
bw <<= 32;
@@ -2673,6 +2704,38 @@ static void tcp_stats_print(struct tcpstat *s)
if (s->bbr_info->bbr_cwnd_gain)
out(",cwnd_gain:%g",
(double)s->bbr_info->bbr_cwnd_gain / 256.0);
+
+ if (s->bbr_info_len >=
+ (offsetof(struct tcp_bbr_info, bbr_extra_acked) +
+ sizeof(__u32))) {
+
+ bw_hi = s->bbr_info->bbr_bw_hi_msb;
+ bw_hi <<= 32;
+ bw_hi |= s->bbr_info->bbr_bw_hi_lsb;
+
+ bw_lo = s->bbr_info->bbr_bw_lo_msb;
+ bw_lo <<= 32;
+ bw_lo |= s->bbr_info->bbr_bw_lo_lsb;
+
+ out(",version:%u", s->bbr_info->bbr_version);
+ if (bw_hi != ~0UL)
+ out(",bw_hi:%sbps", sprint_bw(b1, bw_hi * 8.0));
+ if (bw_lo != ~0UL)
+ out(",bw_lo:%sbps", sprint_bw(b1, bw_lo * 8.0));
+ if (s->bbr_info->bbr_inflight_hi != ~0U)
+ out(",inflight_hi:%u",
+ s->bbr_info->bbr_inflight_hi);
+ if (s->bbr_info->bbr_inflight_lo != ~0U)
+ out(",inflight_lo:%u",
+ s->bbr_info->bbr_inflight_lo);
+ out(",extra_acked:%u", s->bbr_info->bbr_extra_acked);
+ out(",mode:%d", (int)s->bbr_info->bbr_mode);
+ out(",phase:%s",
+ bbr_phase_to_str(
+ (enum tcp_bbr_phase)
+ s->bbr_info->bbr_phase));
+ }
+
out(")");
}

@@ -3147,6 +3210,7 @@ static void tcp_show_info(const struct nlmsghdr *nlh, struct inet_diag_msg *r,
s.bbr_info = calloc(1, sizeof(*s.bbr_info));
if (s.bbr_info && bbr_info)
memcpy(s.bbr_info, bbr_info, len);
+ s.bbr_info_len = len;
}

if (rtt > 0 && info->tcpi_snd_mss && info->tcpi_snd_cwnd) {
--
2.41.0.487.g6d72f3e995-goog

120 changes: 120 additions & 0 deletions 0002-ip-introduce-the-ecn_low-per-route-feature.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
From 537b1b761e1d0036923adba7a80d3655cfff095d Mon Sep 17 00:00:00 2001
From: David Morley <morleyd@google.com>
Date: Fri, 21 Jul 2023 09:20:46 +0000
Subject: [PATCH 1/2] ip: introduce the ecn_low per-route feature

Add a new "ecn_low" feature that administrators can add to a
particular ip route. The ecn_low feature indicates that the given
destination network is a low-latency ECN environment, meaning both
that ECN CE marks are applied by the network using a low-latency
marking threshold and also that TCP endpoints provide precise
per-data-segment ECN feedback in ACKs (where the ACK ECE flag echoes
the received CE status of all newly-acknowledged data segments). This
ecn_low feature indication can be used by congestion control
algorithms to decide how to interpret ECN signals over the given
destination network.

Signed-off-by: David Morley <morleyd@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
---
include/uapi/linux/rtnetlink.h | 4 +++-
ip/iproute.c | 8 +++++++-
man/man8/ip-route.8.in | 19 ++++++++++++++-----
3 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 2132e941b93a..66e3237ea915 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -506,9 +506,11 @@ enum {
#define RTAX_FEATURE_SACK (1 << 1)
#define RTAX_FEATURE_TIMESTAMP (1 << 2)
#define RTAX_FEATURE_ALLFRAG (1 << 3)
+#define RTAX_FEATURE_ECN_LOW (1 << 4)

#define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | RTAX_FEATURE_SACK | \
- RTAX_FEATURE_TIMESTAMP | RTAX_FEATURE_ALLFRAG)
+ RTAX_FEATURE_TIMESTAMP | RTAX_FEATURE_ALLFRAG | \
+ RTAX_FEATURE_ECN_LOW)

struct rta_session {
__u8 proto;
diff --git a/ip/iproute.c b/ip/iproute.c
index fdf1f9a9dd0a..bd5d783cfdbb 100644
--- a/ip/iproute.c
+++ b/ip/iproute.c
@@ -96,7 +96,7 @@ static void usage(void)
"PREF := [ low | medium | high ]\n"
"TIME := NUMBER[s|ms]\n"
"BOOL := [1|0]\n"
- "FEATURES := ecn\n"
+ "FEATURES := [ ecn | ecn_low ]\n"
"ENCAPTYPE := [ mpls | ip | ip6 | seg6 | seg6local | rpl | ioam6 | xfrm ]\n"
"ENCAPHDR := [ MPLSLABEL | SEG6HDR | SEG6LOCAL | IOAM6HDR | XFRMINFO ]\n"
"SEG6HDR := [ mode SEGMODE ] segs ADDR1,ADDRi,ADDRn [hmac HMACKEYID] [cleanup]\n"
@@ -350,6 +350,10 @@ static void print_rtax_features(FILE *fp, unsigned int features)
print_null(PRINT_ANY, "ecn", "ecn ", NULL);
features &= ~RTAX_FEATURE_ECN;
}
+ if (features & RTAX_FEATURE_ECN_LOW) {
+ print_null(PRINT_ANY, "ecn_low", "ecn_low ", NULL);
+ features &= ~RTAX_FEATURE_ECN_LOW;
+ }

if (features)
print_0xhex(PRINT_ANY,
@@ -1349,6 +1353,8 @@ static int iproute_modify(int cmd, unsigned int flags, int argc, char **argv)

if (strcmp(*argv, "ecn") == 0)
features |= RTAX_FEATURE_ECN;
+ else if (strcmp(*argv, "ecn_low") == 0)
+ features |= RTAX_FEATURE_ECN_LOW;
else
invarg("\"features\" value not valid\n", *argv);
break;
diff --git a/man/man8/ip-route.8.in b/man/man8/ip-route.8.in
index c2b00833e507..10f35f38e6a2 100644
--- a/man/man8/ip-route.8.in
+++ b/man/man8/ip-route.8.in
@@ -181,7 +181,7 @@ throw " | " unreachable " | " prohibit " | " blackhole " | " nat " ]"

.ti -8
.IR FEATURES " := [ "
-.BR ecn " | ]"
+.BR ecn " | " ecn_low " ] "

.ti -8
.IR PREF " := [ "
@@ -544,16 +544,25 @@ The default value is zero, meaning to use Slow Start value.

.TP
.BI features " FEATURES " (Linux 3.18+ only)
-Enable or disable per-route features. Only available feature at this
-time is
-.B ecn
-to enable explicit congestion notification when initiating connections to the
+Enable or disable per-route features. Available features include:
+
+.BI ecn
+- to enable explicit congestion notification when initiating connections to the
given destination network.
When responding to a connection request from the given network, ecn will
also be used even if the
.B net.ipv4.tcp_ecn
sysctl is set to 0.

+.BI ecn_low
+- to indicate that the given destination network is a low-latency ECN
+environment, meaning both that ECN CE marks are applied by the network using a
+low-latency marking threshold and also that TCP endpoints provide precise
+per-data-segment ECN feedback in ACKs (where the ACK ECE flag echoes the
+received CE status of all newly-acknowledged data segments). This ecn_low
+feature indication can be used by congestion control algorithms to decide how
+to interpret ECN signals over the given destination network (Linux 6.7+ only).
+
.TP
.BI quickack " BOOL " "(Linux 3.11+ only)"
Enable or disable quick ack for connections to this destination.
--
2.41.0.487.g6d72f3e995-goog

59 changes: 59 additions & 0 deletions 0003-ss-display-ecn_low-if-tcp_info-tcpi_options-TCPI_OPT.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
From 107339d7f48c95ae8a7461150e143fc53b08fea9 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Sun, 23 Jul 2023 23:33:21 -0400
Subject: [PATCH] ss: display "ecn_low" if tcp_info tcpi_options
TCPI_OPT_ECN_LOW bit is set

Display "ecn_low" if the TCPI_OPT_ECN_LOW bit is set in the
tcpi_options field in tcp_info.

Signed-off-by: Neal Cardwell <ncardwell@google.com>
---
include/uapi/linux/tcp.h | 1 +
misc/ss.c | 4 ++++
2 files changed, 5 insertions(+)

diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 4ee5d7721c6f..238e95d7693c 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -170,6 +170,7 @@ enum tcp_fastopen_client_fail {
#define TCPI_OPT_ECN 8 /* ECN was negociated at TCP session init */
#define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */
#define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */
+#define TCPI_OPT_ECN_LOW 64 /* Low-latency ECN configured at init */

/*
* Sender's congestion state indicating normal or abnormal situations
diff --git a/misc/ss.c b/misc/ss.c
index 5f413118f0dd..634285adb7d3 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -907,6 +907,7 @@ struct tcpstat {
bool has_sack_opt;
bool has_ecn_opt;
bool has_ecnseen_opt;
+ bool has_ecn_low_opt;
bool has_fastopen_opt;
bool has_wscale_opt;
bool app_limited;
@@ -2621,6 +2622,8 @@ static void tcp_stats_print(struct tcpstat *s)
out(" ecn");
if (s->has_ecnseen_opt)
out(" ecnseen");
+ if (s->has_ecn_low_opt)
+ out(" ecn_low");
if (s->has_fastopen_opt)
out(" fastopen");
if (s->cong_alg[0])
@@ -3134,6 +3137,7 @@ static void tcp_show_info(const struct nlmsghdr *nlh, struct inet_diag_msg *r,
s.has_sack_opt = TCPI_HAS_OPT(info, TCPI_OPT_SACK);
s.has_ecn_opt = TCPI_HAS_OPT(info, TCPI_OPT_ECN);
s.has_ecnseen_opt = TCPI_HAS_OPT(info, TCPI_OPT_ECN_SEEN);
+ s.has_ecn_low_opt = TCPI_HAS_OPT(info, TCPI_OPT_ECN_LOW);
s.has_fastopen_opt = TCPI_HAS_OPT(info, TCPI_OPT_SYN_DATA);
}

--
2.41.0.487.g6d72f3e995-goog

Loading