From 39771b127b412377d6354893c7d43ee8f2edecfd Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sat, 2 Apr 2016 23:08:06 -0400 Subject: [PATCH 1/8] sock: break up sock_cmsg_snd into __sock_cmsg_snd and loop To process cmsg's of the SOL_SOCKET level in addition to cmsgs of another level, protocols can call sock_cmsg_send(). This causes a double walk on the cmsghdr list, one for SOL_SOCKET and one for the other level. Extract the inner demultiplex logic from the loop that walks the list, to allow having this called directly from a walker in the protocol specific code. Signed-off-by: Willem de Bruijn Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/net/sock.h | 2 ++ net/core/sock.c | 33 ++++++++++++++++++++++----------- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 255d3e03727b..03772d4b06e6 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1420,6 +1420,8 @@ struct sockcm_cookie { u32 mark; }; +int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, + struct sockcm_cookie *sockc); int sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct sockcm_cookie *sockc); diff --git a/net/core/sock.c b/net/core/sock.c index b67b9aedb230..66976f88566b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1866,27 +1866,38 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, } EXPORT_SYMBOL(sock_alloc_send_skb); +int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, + struct sockcm_cookie *sockc) +{ + switch (cmsg->cmsg_type) { + case SO_MARK: + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + return -EPERM; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + sockc->mark = *(u32 *)CMSG_DATA(cmsg); + break; + default: + return -EINVAL; + } + return 0; +} +EXPORT_SYMBOL(__sock_cmsg_send); + int sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct sockcm_cookie *sockc) { struct cmsghdr *cmsg; + int ret; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; if (cmsg->cmsg_level != SOL_SOCKET) continue; - switch (cmsg->cmsg_type) { - case SO_MARK: - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) - return -EPERM; - if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) - return -EINVAL; - sockc->mark = *(u32 *)CMSG_DATA(cmsg); - break; - default: - return -EINVAL; - } + ret = __sock_cmsg_send(sk, msg, cmsg, sockc); + if (ret) + return ret; } return 0; } From 6db8b963a7a31047573f229492ff6fc0f51cc377 Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Sat, 2 Apr 2016 23:08:07 -0400 Subject: [PATCH 2/8] tcp: accept SOF_TIMESTAMPING_OPT_ID for passive TFO SOF_TIMESTAMPING_OPT_ID is set to get data-independent IDs to associate timestamps with send calls. For TCP connections, tp->snd_una is used as the starting point to calculate relative IDs. This socket option will fail if set before the handshake on a passive TCP fast open connection with data in SYN or SYN/ACK, since setsockopt requires the connection to be in the ESTABLISHED state. To address these, instead of limiting the option to the ESTABLISHED state, accept the SOF_TIMESTAMPING_OPT_ID option as long as the connection is not in LISTEN or CLOSE states. Signed-off-by: Soheil Hassas Yeganeh Acked-by: Willem de Bruijn Acked-by: Yuchung Cheng Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/sock.c b/net/core/sock.c index 66976f88566b..0a64fe20ce5a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -832,7 +832,8 @@ set_rcvbuf: !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { if (sk->sk_protocol == IPPROTO_TCP && sk->sk_type == SOCK_STREAM) { - if (sk->sk_state != TCP_ESTABLISHED) { + if ((1 << sk->sk_state) & + (TCPF_CLOSE | TCPF_LISTEN)) { ret = -EINVAL; break; } From 6b084928baac562ed61866f540a96120e9c9ddb7 Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Sat, 2 Apr 2016 23:08:08 -0400 Subject: [PATCH 3/8] tcp: use one bit in TCP_SKB_CB to mark ACK timestamps Currently, to avoid a cache line miss for accessing skb_shinfo, tcp_ack_tstamp skips socket that do not have SOF_TIMESTAMPING_TX_ACK bit set in sk_tsflags. This is implemented based on an implicit assumption that the SOF_TIMESTAMPING_TX_ACK is set via socket options for the duration that ACK timestamps are needed. To implement per-write timestamps, this check should be removed and replaced with a per-packet alternative that quickly skips packets missing ACK timestamps marks without a cache-line miss. To enable per-packet marking without a cache line miss, use one bit in TCP_SKB_CB to mark a whether a SKB might need a ack tx timestamp or not. Further checks in tcp_ack_tstamp are not modified and work as before. Signed-off-by: Soheil Hassas Yeganeh Acked-by: Willem de Bruijn Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 3 ++- net/ipv4/tcp.c | 2 ++ net/ipv4/tcp_input.c | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index f8bb4a4ed3d1..a23282996ca9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -754,7 +754,8 @@ struct tcp_skb_cb { TCPCB_REPAIRED) __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */ - /* 1 byte hole */ + __u8 txstamp_ack:1, /* Record TX timestamp for ack? */ + unused:7; __u32 ack_seq; /* Sequence number ACK'd */ union { struct inet_skb_parm h4; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 08b8b960a8ed..ce3c9eb901a0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -432,10 +432,12 @@ static void tcp_tx_timestamp(struct sock *sk, struct sk_buff *skb) { if (sk->sk_tsflags) { struct skb_shared_info *shinfo = skb_shinfo(skb); + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); sock_tx_timestamp(sk, &shinfo->tx_flags); if (shinfo->tx_flags & SKBTX_ANY_TSTAMP) shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1; + tcb->txstamp_ack = !!(shinfo->tx_flags & SKBTX_ACK_TSTAMP); } } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f87b84a75691..a26e2d262358 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3082,7 +3082,7 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, const struct skb_shared_info *shinfo; /* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */ - if (likely(!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK))) + if (likely(!TCP_SKB_CB(skb)->txstamp_ack)) return; shinfo = skb_shinfo(skb); From 3dd17e63f5131bf2528f34aa5e3e57758175af92 Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Sat, 2 Apr 2016 23:08:09 -0400 Subject: [PATCH 4/8] sock: accept SO_TIMESTAMPING flags in socket cmsg Accept SO_TIMESTAMPING in control messages of the SOL_SOCKET level as a basis to accept timestamping requests per write. This implementation only accepts TX recording flags (i.e., SOF_TIMESTAMPING_TX_HARDWARE, SOF_TIMESTAMPING_TX_SOFTWARE, SOF_TIMESTAMPING_TX_SCHED, and SOF_TIMESTAMPING_TX_ACK) in control messages. Users need to set reporting flags (e.g., SOF_TIMESTAMPING_OPT_ID) per socket via socket options. This commit adds a tsflags field in sockcm_cookie which is set in __sock_cmsg_send. It only override the SOF_TIMESTAMPING_TX_* bits in sockcm_cookie.tsflags allowing the control message to override the recording behavior per write, yet maintaining the value of other flags. This patch implements validating the control message and setting tsflags in struct sockcm_cookie. Next commits in this series will actually implement timestamping per write for different protocols. Signed-off-by: Soheil Hassas Yeganeh Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/sock.h | 1 + include/uapi/linux/net_tstamp.h | 10 ++++++++++ net/core/sock.c | 13 +++++++++++++ 3 files changed, 24 insertions(+) diff --git a/include/net/sock.h b/include/net/sock.h index 03772d4b06e6..af012da5e608 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1418,6 +1418,7 @@ void sk_send_sigurg(struct sock *sk); struct sockcm_cookie { u32 mark; + u16 tsflags; }; int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index 6d1abea9746e..264e515de16f 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -31,6 +31,16 @@ enum { SOF_TIMESTAMPING_LAST }; +/* + * SO_TIMESTAMPING flags are either for recording a packet timestamp or for + * reporting the timestamp to user space. + * Recording flags can be set both via socket options and control messages. + */ +#define SOF_TIMESTAMPING_TX_RECORD_MASK (SOF_TIMESTAMPING_TX_HARDWARE | \ + SOF_TIMESTAMPING_TX_SOFTWARE | \ + SOF_TIMESTAMPING_TX_SCHED | \ + SOF_TIMESTAMPING_TX_ACK) + /** * struct hwtstamp_config - %SIOCGHWTSTAMP and %SIOCSHWTSTAMP parameter * diff --git a/net/core/sock.c b/net/core/sock.c index 0a64fe20ce5a..315f5e57fffe 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1870,6 +1870,8 @@ EXPORT_SYMBOL(sock_alloc_send_skb); int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, struct sockcm_cookie *sockc) { + u32 tsflags; + switch (cmsg->cmsg_type) { case SO_MARK: if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) @@ -1878,6 +1880,17 @@ int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, return -EINVAL; sockc->mark = *(u32 *)CMSG_DATA(cmsg); break; + case SO_TIMESTAMPING: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + + tsflags = *(u32 *)CMSG_DATA(cmsg); + if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK) + return -EINVAL; + + sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; + sockc->tsflags |= tsflags; + break; default: return -EINVAL; } From 24025c465f77c3585f73450bab19501b2edd6fba Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Sat, 2 Apr 2016 23:08:10 -0400 Subject: [PATCH 5/8] ipv4: process socket-level control messages in IPv4 Process socket-level control messages by invoking __sock_cmsg_send in ip_cmsg_send for control messages on the SOL_SOCKET layer. This makes sure whenever ip_cmsg_send is called in udp, icmp, and raw, we also process socket-level control messages. Note that this commit interprets new control messages that were ignored before. As such, this commit does not change the behavior of IPv4 control messages. Signed-off-by: Soheil Hassas Yeganeh Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/ip.h | 3 ++- net/ipv4/ip_sockglue.c | 9 ++++++++- net/ipv4/ping.c | 2 +- net/ipv4/raw.c | 2 +- net/ipv4/udp.c | 3 +-- 5 files changed, 13 insertions(+), 6 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index fad74d323bd6..93725e546758 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -56,6 +56,7 @@ static inline unsigned int ip_hdrlen(const struct sk_buff *skb) } struct ipcm_cookie { + struct sockcm_cookie sockc; __be32 addr; int oif; struct ip_options_rcu *opt; @@ -550,7 +551,7 @@ int ip_options_rcv_srr(struct sk_buff *skb); void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb); void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb, int offset); -int ip_cmsg_send(struct net *net, struct msghdr *msg, +int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, bool allow_ipv6); int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 035ad645a8d9..1b7c0776c805 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -219,11 +219,12 @@ void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb, } EXPORT_SYMBOL(ip_cmsg_recv_offset); -int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc, +int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, bool allow_ipv6) { int err, val; struct cmsghdr *cmsg; + struct net *net = sock_net(sk); for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) @@ -244,6 +245,12 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc, continue; } #endif + if (cmsg->cmsg_level == SOL_SOCKET) { + if (__sock_cmsg_send(sk, msg, cmsg, &ipc->sockc)) + return -EINVAL; + continue; + } + if (cmsg->cmsg_level != SOL_IP) continue; switch (cmsg->cmsg_type) { diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index cf9700b1a106..670639bf9f7e 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -747,7 +747,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) sock_tx_timestamp(sk, &ipc.tx_flags); if (msg->msg_controllen) { - err = ip_cmsg_send(sock_net(sk), msg, &ipc, false); + err = ip_cmsg_send(sk, msg, &ipc, false); if (unlikely(err)) { kfree(ipc.opt); return err; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 8d22de74080c..088ce665fc7b 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -548,7 +548,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc.oif = sk->sk_bound_dev_if; if (msg->msg_controllen) { - err = ip_cmsg_send(net, msg, &ipc, false); + err = ip_cmsg_send(sk, msg, &ipc, false); if (unlikely(err)) { kfree(ipc.opt); goto out; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 08eed5e16df0..bccb4e11047a 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1034,8 +1034,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) sock_tx_timestamp(sk, &ipc.tx_flags); if (msg->msg_controllen) { - err = ip_cmsg_send(sock_net(sk), msg, &ipc, - sk->sk_family == AF_INET6); + err = ip_cmsg_send(sk, msg, &ipc, sk->sk_family == AF_INET6); if (unlikely(err)) { kfree(ipc.opt); return err; From ad1e46a837163a3e7160a1250825bcfafd2e714b Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Sat, 2 Apr 2016 23:08:11 -0400 Subject: [PATCH 6/8] ipv6: process socket-level control messages in IPv6 Process socket-level control messages by invoking __sock_cmsg_send in ip6_datagram_send_ctl for control messages on the SOL_SOCKET layer. This makes sure whenever ip6_datagram_send_ctl is called for udp and raw, we also process socket-level control messages. This is a bit uglier than IPv4, since IPv6 does not have something like ipcm_cookie. Perhaps we can later create a control message cookie for IPv6? Note that this commit interprets new control messages that were ignored before. As such, this commit does not change the behavior of IPv6 control messages. Signed-off-by: Soheil Hassas Yeganeh Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/transp_v6.h | 3 ++- net/ipv6/datagram.c | 9 ++++++++- net/ipv6/ip6_flowlabel.c | 3 ++- net/ipv6/ipv6_sockglue.c | 3 ++- net/ipv6/raw.c | 6 +++++- net/ipv6/udp.c | 5 ++++- net/l2tp/l2tp_ip6.c | 8 +++++--- 7 files changed, 28 insertions(+), 9 deletions(-) diff --git a/include/net/transp_v6.h b/include/net/transp_v6.h index b927413dde86..2b1c3450ab20 100644 --- a/include/net/transp_v6.h +++ b/include/net/transp_v6.h @@ -42,7 +42,8 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg, int ip6_datagram_send_ctl(struct net *net, struct sock *sk, struct msghdr *msg, struct flowi6 *fl6, struct ipv6_txoptions *opt, - int *hlimit, int *tclass, int *dontfrag); + int *hlimit, int *tclass, int *dontfrag, + struct sockcm_cookie *sockc); void ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, __u16 srcp, __u16 destp, int bucket); diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 428162155280..a73d70119fcd 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -685,7 +685,8 @@ EXPORT_SYMBOL_GPL(ip6_datagram_recv_ctl); int ip6_datagram_send_ctl(struct net *net, struct sock *sk, struct msghdr *msg, struct flowi6 *fl6, struct ipv6_txoptions *opt, - int *hlimit, int *tclass, int *dontfrag) + int *hlimit, int *tclass, int *dontfrag, + struct sockcm_cookie *sockc) { struct in6_pktinfo *src_info; struct cmsghdr *cmsg; @@ -702,6 +703,12 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, goto exit_f; } + if (cmsg->cmsg_level == SOL_SOCKET) { + if (__sock_cmsg_send(sk, msg, cmsg, sockc)) + return -EINVAL; + continue; + } + if (cmsg->cmsg_level != SOL_IPV6) continue; diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index dc2db4f7b182..35d3ddc328f8 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -372,6 +372,7 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, if (olen > 0) { struct msghdr msg; struct flowi6 flowi6; + struct sockcm_cookie sockc_junk; int junk; err = -ENOMEM; @@ -390,7 +391,7 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, memset(&flowi6, 0, sizeof(flowi6)); err = ip6_datagram_send_ctl(net, sk, &msg, &flowi6, fl->opt, - &junk, &junk, &junk); + &junk, &junk, &junk, &sockc_junk); if (err) goto done; err = -EINVAL; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 4449ad1f8114..a5557d22f89e 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -471,6 +471,7 @@ sticky_done: struct ipv6_txoptions *opt = NULL; struct msghdr msg; struct flowi6 fl6; + struct sockcm_cookie sockc_junk; int junk; memset(&fl6, 0, sizeof(fl6)); @@ -503,7 +504,7 @@ sticky_done: msg.msg_control = (void *)(opt+1); retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, opt, &junk, - &junk, &junk); + &junk, &junk, &sockc_junk); if (retv) goto done; update: diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index fa59dd7a427e..f175ec0a97ce 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -745,6 +745,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct dst_entry *dst = NULL; struct raw6_frag_vec rfv; struct flowi6 fl6; + struct sockcm_cookie sockc; int addr_len = msg->msg_namelen; int hlimit = -1; int tclass = -1; @@ -821,13 +822,16 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (fl6.flowi6_oif == 0) fl6.flowi6_oif = sk->sk_bound_dev_if; + sockc.tsflags = 0; + if (msg->msg_controllen) { opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_txoptions)); opt->tot_len = sizeof(struct ipv6_txoptions); err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, opt, - &hlimit, &tclass, &dontfrag); + &hlimit, &tclass, &dontfrag, + &sockc); if (err < 0) { fl6_sock_release(flowlabel); return err; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 8125931106be..2a787af42163 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1128,6 +1128,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int connected = 0; int is_udplite = IS_UDPLITE(sk); int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); + struct sockcm_cookie sockc; /* destination address check */ if (sin6) { @@ -1247,6 +1248,7 @@ do_udp_sendmsg: fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex; fl6.flowi6_mark = sk->sk_mark; + sockc.tsflags = 0; if (msg->msg_controllen) { opt = &opt_space; @@ -1254,7 +1256,8 @@ do_udp_sendmsg: opt->tot_len = sizeof(*opt); err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, opt, - &hlimit, &tclass, &dontfrag); + &hlimit, &tclass, &dontfrag, + &sockc); if (err < 0) { fl6_sock_release(flowlabel); return err; diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 6b54ff3ff4cb..4f29a4a0f360 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -492,6 +492,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct ip6_flowlabel *flowlabel = NULL; struct dst_entry *dst = NULL; struct flowi6 fl6; + struct sockcm_cookie sockc_unused = {0}; int addr_len = msg->msg_namelen; int hlimit = -1; int tclass = -1; @@ -562,9 +563,10 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) memset(opt, 0, sizeof(struct ipv6_txoptions)); opt->tot_len = sizeof(struct ipv6_txoptions); - err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, opt, - &hlimit, &tclass, &dontfrag); - if (err < 0) { + err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, opt, + &hlimit, &tclass, &dontfrag, + &sockc_unused); + if (err < 0) { fl6_sock_release(flowlabel); return err; } From c14ac9451c34832554db33386a4393be8bba3a7b Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Sat, 2 Apr 2016 23:08:12 -0400 Subject: [PATCH 7/8] sock: enable timestamping using control messages Currently, SOL_TIMESTAMPING can only be enabled using setsockopt. This is very costly when users want to sample writes to gather tx timestamps. Add support for enabling SO_TIMESTAMPING via control messages by using tsflags added in `struct sockcm_cookie` (added in the previous patches in this series) to set the tx_flags of the last skb created in a sendmsg. With this patch, the timestamp recording bits in tx_flags of the skbuff is overridden if SO_TIMESTAMPING is passed in a cmsg. Please note that this is only effective for overriding the recording timestamps flags. Users should enable timestamp reporting (e.g., SOF_TIMESTAMPING_SOFTWARE | SOF_TIMESTAMPING_OPT_ID) using socket options and then should ask for SOF_TIMESTAMPING_TX_* using control messages per sendmsg to sample timestamps for each write. Signed-off-by: Soheil Hassas Yeganeh Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- drivers/net/tun.c | 3 ++- include/net/ipv6.h | 6 ++++-- include/net/sock.h | 10 ++++++---- net/can/raw.c | 2 +- net/ipv4/ping.c | 5 +++-- net/ipv4/raw.c | 11 ++++++----- net/ipv4/tcp.c | 20 +++++++++++++++----- net/ipv4/udp.c | 7 ++++--- net/ipv6/icmp.c | 6 ++++-- net/ipv6/ip6_output.c | 15 +++++++++------ net/ipv6/ping.c | 3 ++- net/ipv6/raw.c | 5 ++--- net/ipv6/udp.c | 7 ++++--- net/l2tp/l2tp_ip6.c | 2 +- net/packet/af_packet.c | 30 +++++++++++++++++++++++++----- net/socket.c | 10 +++++----- 16 files changed, 93 insertions(+), 49 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 510e90a6bb26..9abc36bf77ea 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -861,7 +861,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) goto drop; if (skb->sk && sk_fullsock(skb->sk)) { - sock_tx_timestamp(skb->sk, &skb_shinfo(skb)->tx_flags); + sock_tx_timestamp(skb->sk, skb->sk->sk_tsflags, + &skb_shinfo(skb)->tx_flags); sw_tx_timestamp(skb); } diff --git a/include/net/ipv6.h b/include/net/ipv6.h index d0aeb97aec5d..55ee1eb7d026 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -867,7 +867,8 @@ int ip6_append_data(struct sock *sk, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6, - struct rt6_info *rt, unsigned int flags, int dontfrag); + struct rt6_info *rt, unsigned int flags, int dontfrag, + const struct sockcm_cookie *sockc); int ip6_push_pending_frames(struct sock *sk); @@ -884,7 +885,8 @@ struct sk_buff *ip6_make_skb(struct sock *sk, void *from, int length, int transhdrlen, int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6, struct rt6_info *rt, - unsigned int flags, int dontfrag); + unsigned int flags, int dontfrag, + const struct sockcm_cookie *sockc); static inline struct sk_buff *ip6_finish_skb(struct sock *sk) { diff --git a/include/net/sock.h b/include/net/sock.h index af012da5e608..e91b87f54f99 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2057,19 +2057,21 @@ static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, sk->sk_stamp = skb->tstamp; } -void __sock_tx_timestamp(const struct sock *sk, __u8 *tx_flags); +void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags); /** * sock_tx_timestamp - checks whether the outgoing packet is to be time stamped * @sk: socket sending this packet + * @tsflags: timestamping flags to use * @tx_flags: completed with instructions for time stamping * * Note : callers should take care of initial *tx_flags value (usually 0) */ -static inline void sock_tx_timestamp(const struct sock *sk, __u8 *tx_flags) +static inline void sock_tx_timestamp(const struct sock *sk, __u16 tsflags, + __u8 *tx_flags) { - if (unlikely(sk->sk_tsflags)) - __sock_tx_timestamp(sk, tx_flags); + if (unlikely(tsflags)) + __sock_tx_timestamp(tsflags, tx_flags); if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS))) *tx_flags |= SKBTX_WIFI_STATUS; } diff --git a/net/can/raw.c b/net/can/raw.c index 2e67b1423cd3..972c187d40ab 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -755,7 +755,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) if (err < 0) goto free_skb; - sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); + sock_tx_timestamp(sk, sk->sk_tsflags, &skb_shinfo(skb)->tx_flags); skb->dev = dev; skb->sk = sk; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 670639bf9f7e..66ddcb60519a 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -737,6 +737,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) /* no remote port */ } + ipc.sockc.tsflags = sk->sk_tsflags; ipc.addr = inet->inet_saddr; ipc.opt = NULL; ipc.oif = sk->sk_bound_dev_if; @@ -744,8 +745,6 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc.ttl = 0; ipc.tos = -1; - sock_tx_timestamp(sk, &ipc.tx_flags); - if (msg->msg_controllen) { err = ip_cmsg_send(sk, msg, &ipc, false); if (unlikely(err)) { @@ -768,6 +767,8 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_unlock(); } + sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags); + saddr = ipc.addr; ipc.addr = faddr = daddr; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 088ce665fc7b..438f50c1a676 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -339,8 +339,8 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb) static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, struct msghdr *msg, size_t length, - struct rtable **rtp, - unsigned int flags) + struct rtable **rtp, unsigned int flags, + const struct sockcm_cookie *sockc) { struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); @@ -379,7 +379,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, skb->ip_summed = CHECKSUM_NONE; - sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); + sock_tx_timestamp(sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags); skb->transport_header = skb->network_header; err = -EFAULT; @@ -540,6 +540,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) daddr = inet->inet_daddr; } + ipc.sockc.tsflags = sk->sk_tsflags; ipc.addr = inet->inet_saddr; ipc.opt = NULL; ipc.tx_flags = 0; @@ -638,10 +639,10 @@ back_from_confirm: if (inet->hdrincl) err = raw_send_hdrinc(sk, &fl4, msg, len, - &rt, msg->msg_flags); + &rt, msg->msg_flags, &ipc.sockc); else { - sock_tx_timestamp(sk, &ipc.tx_flags); + sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags); if (!ipc.addr) ipc.addr = fl4.daddr; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ce3c9eb901a0..4d73858991af 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -428,13 +428,13 @@ void tcp_init_sock(struct sock *sk) } EXPORT_SYMBOL(tcp_init_sock); -static void tcp_tx_timestamp(struct sock *sk, struct sk_buff *skb) +static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb) { - if (sk->sk_tsflags) { + if (sk->sk_tsflags || tsflags) { struct skb_shared_info *shinfo = skb_shinfo(skb); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); - sock_tx_timestamp(sk, &shinfo->tx_flags); + sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags); if (shinfo->tx_flags & SKBTX_ANY_TSTAMP) shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1; tcb->txstamp_ack = !!(shinfo->tx_flags & SKBTX_ACK_TSTAMP); @@ -959,7 +959,7 @@ new_segment: offset += copy; size -= copy; if (!size) { - tcp_tx_timestamp(sk, skb); + tcp_tx_timestamp(sk, sk->sk_tsflags, skb); goto out; } @@ -1079,6 +1079,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; + struct sockcm_cookie sockc; int flags, err, copied = 0; int mss_now = 0, size_goal, copied_syn = 0; bool sg; @@ -1121,6 +1122,15 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) /* 'common' sending to sendq */ } + sockc.tsflags = sk->sk_tsflags; + if (msg->msg_controllen) { + err = sock_cmsg_send(sk, msg, &sockc); + if (unlikely(err)) { + err = -EINVAL; + goto out_err; + } + } + /* This should be in poll */ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); @@ -1239,7 +1249,7 @@ new_segment: copied += copy; if (!msg_data_left(msg)) { - tcp_tx_timestamp(sk, skb); + tcp_tx_timestamp(sk, sockc.tsflags, skb); goto out; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index bccb4e11047a..45ff590661f4 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1027,12 +1027,11 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) */ connected = 1; } + + ipc.sockc.tsflags = sk->sk_tsflags; ipc.addr = inet->inet_saddr; - ipc.oif = sk->sk_bound_dev_if; - sock_tx_timestamp(sk, &ipc.tx_flags); - if (msg->msg_controllen) { err = ip_cmsg_send(sk, msg, &ipc, sk->sk_family == AF_INET6); if (unlikely(err)) { @@ -1059,6 +1058,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) saddr = ipc.addr; ipc.addr = faddr = daddr; + sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags); + if (ipc.opt && ipc.opt->opt.srr) { if (!daddr) return -EINVAL; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 0a37ddc7af51..6b573ebe49de 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -400,6 +400,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) struct icmp6hdr tmp_hdr; struct flowi6 fl6; struct icmpv6_msg msg; + struct sockcm_cookie sockc_unused = {0}; int iif = 0; int addr_type = 0; int len; @@ -527,7 +528,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) len + sizeof(struct icmp6hdr), sizeof(struct icmp6hdr), hlimit, np->tclass, NULL, &fl6, (struct rt6_info *)dst, - MSG_DONTWAIT, np->dontfrag); + MSG_DONTWAIT, np->dontfrag, &sockc_unused); if (err) { ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); @@ -566,6 +567,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) int hlimit; u8 tclass; u32 mark = IP6_REPLY_MARK(net, skb->mark); + struct sockcm_cookie sockc_unused = {0}; saddr = &ipv6_hdr(skb)->daddr; @@ -617,7 +619,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) err = ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr), sizeof(struct icmp6hdr), hlimit, tclass, NULL, &fl6, (struct rt6_info *)dst, MSG_DONTWAIT, - np->dontfrag); + np->dontfrag, &sockc_unused); if (err) { ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTERRORS); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 9428345d3a07..612f3d138bf0 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1258,7 +1258,8 @@ static int __ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, - unsigned int flags, int dontfrag) + unsigned int flags, int dontfrag, + const struct sockcm_cookie *sockc) { struct sk_buff *skb, *skb_prev = NULL; unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu; @@ -1329,7 +1330,7 @@ emsgsize: csummode = CHECKSUM_PARTIAL; if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) { - sock_tx_timestamp(sk, &tx_flags); + sock_tx_timestamp(sk, sockc->tsflags, &tx_flags); if (tx_flags & SKBTX_ANY_SW_TSTAMP && sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) tskey = sk->sk_tskey++; @@ -1565,7 +1566,8 @@ int ip6_append_data(struct sock *sk, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6, - struct rt6_info *rt, unsigned int flags, int dontfrag) + struct rt6_info *rt, unsigned int flags, int dontfrag, + const struct sockcm_cookie *sockc) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); @@ -1593,7 +1595,8 @@ int ip6_append_data(struct sock *sk, return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base, &np->cork, sk_page_frag(sk), getfrag, - from, length, transhdrlen, flags, dontfrag); + from, length, transhdrlen, flags, dontfrag, + sockc); } EXPORT_SYMBOL_GPL(ip6_append_data); @@ -1752,7 +1755,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk, int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6, struct rt6_info *rt, unsigned int flags, - int dontfrag) + int dontfrag, const struct sockcm_cookie *sockc) { struct inet_cork_full cork; struct inet6_cork v6_cork; @@ -1779,7 +1782,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk, err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork, ¤t->task_frag, getfrag, from, length + exthdrlen, transhdrlen + exthdrlen, - flags, dontfrag); + flags, dontfrag, sockc); if (err) { __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork); return ERR_PTR(err); diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index c382db7a2e73..da1cff79e447 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -62,6 +62,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct dst_entry *dst; struct rt6_info *rt; struct pingfakehdr pfh; + struct sockcm_cookie junk = {0}; pr_debug("ping_v6_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num); @@ -144,7 +145,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) err = ip6_append_data(sk, ping_getfrag, &pfh, len, 0, hlimit, np->tclass, NULL, &fl6, rt, - MSG_DONTWAIT, np->dontfrag); + MSG_DONTWAIT, np->dontfrag, &junk); if (err) { ICMP6_INC_STATS(sock_net(sk), rt->rt6i_idev, diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index f175ec0a97ce..b07ce21983aa 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -822,8 +822,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (fl6.flowi6_oif == 0) fl6.flowi6_oif = sk->sk_bound_dev_if; - sockc.tsflags = 0; - + sockc.tsflags = sk->sk_tsflags; if (msg->msg_controllen) { opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_txoptions)); @@ -901,7 +900,7 @@ back_from_confirm: lock_sock(sk); err = ip6_append_data(sk, raw6_getfrag, &rfv, len, 0, hlimit, tclass, opt, &fl6, (struct rt6_info *)dst, - msg->msg_flags, dontfrag); + msg->msg_flags, dontfrag, &sockc); if (err) ip6_flush_pending_frames(sk); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 2a787af42163..b772a7641fbd 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1248,7 +1248,7 @@ do_udp_sendmsg: fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex; fl6.flowi6_mark = sk->sk_mark; - sockc.tsflags = 0; + sockc.tsflags = sk->sk_tsflags; if (msg->msg_controllen) { opt = &opt_space; @@ -1324,7 +1324,7 @@ back_from_confirm: skb = ip6_make_skb(sk, getfrag, msg, ulen, sizeof(struct udphdr), hlimit, tclass, opt, &fl6, (struct rt6_info *)dst, - msg->msg_flags, dontfrag); + msg->msg_flags, dontfrag, &sockc); err = PTR_ERR(skb); if (!IS_ERR_OR_NULL(skb)) err = udp_v6_send_skb(skb, &fl6); @@ -1351,7 +1351,8 @@ do_append_data: err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr), hlimit, tclass, opt, &fl6, (struct rt6_info *)dst, - corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags, dontfrag); + corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags, dontfrag, + &sockc); if (err) udp_v6_flush_pending_frames(sk); else if (!corkreq) diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 4f29a4a0f360..1a38f20b1ca6 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -627,7 +627,7 @@ back_from_confirm: err = ip6_append_data(sk, ip_generic_getfrag, msg, ulen, transhdrlen, hlimit, tclass, opt, &fl6, (struct rt6_info *)dst, - msg->msg_flags, dontfrag); + msg->msg_flags, dontfrag, &sockc_unused); if (err) ip6_flush_pending_frames(sk); else if (!(msg->msg_flags & MSG_MORE)) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 1ecfa710ca98..0007e23202e4 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1837,6 +1837,7 @@ static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg, DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name); struct sk_buff *skb = NULL; struct net_device *dev; + struct sockcm_cookie sockc; __be16 proto = 0; int err; int extra_len = 0; @@ -1925,12 +1926,21 @@ retry: goto out_unlock; } + sockc.tsflags = 0; + if (msg->msg_controllen) { + err = sock_cmsg_send(sk, msg, &sockc); + if (unlikely(err)) { + err = -EINVAL; + goto out_unlock; + } + } + skb->protocol = proto; skb->dev = dev; skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); + sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags); if (unlikely(extra_len == 4)) skb->no_fcs = 1; @@ -2486,7 +2496,8 @@ static int packet_snd_vnet_gso(struct sk_buff *skb, static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, void *frame, struct net_device *dev, void *data, int tp_len, - __be16 proto, unsigned char *addr, int hlen, int copylen) + __be16 proto, unsigned char *addr, int hlen, int copylen, + const struct sockcm_cookie *sockc) { union tpacket_uhdr ph; int to_write, offset, len, nr_frags, len_max; @@ -2500,7 +2511,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, skb->dev = dev; skb->priority = po->sk.sk_priority; skb->mark = po->sk.sk_mark; - sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags); + sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags); skb_shinfo(skb)->destructor_arg = ph.raw; skb_reserve(skb, hlen); @@ -2624,6 +2635,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) struct sk_buff *skb; struct net_device *dev; struct virtio_net_hdr *vnet_hdr = NULL; + struct sockcm_cookie sockc; __be16 proto; int err, reserve = 0; void *ph; @@ -2655,6 +2667,13 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex); } + sockc.tsflags = 0; + if (msg->msg_controllen) { + err = sock_cmsg_send(&po->sk, msg, &sockc); + if (unlikely(err)) + goto out; + } + err = -ENXIO; if (unlikely(dev == NULL)) goto out; @@ -2712,7 +2731,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) goto out_status; } tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto, - addr, hlen, copylen); + addr, hlen, copylen, &sockc); if (likely(tp_len >= 0) && tp_len > dev->mtu + reserve && !po->has_vnet_hdr && @@ -2851,6 +2870,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (unlikely(!(dev->flags & IFF_UP))) goto out_unlock; + sockc.tsflags = 0; sockc.mark = sk->sk_mark; if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); @@ -2908,7 +2928,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) goto out_free; } - sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); + sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags); if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) && !packet_extra_vlan_len_allowed(dev, skb)) { diff --git a/net/socket.c b/net/socket.c index 5f77a8e93830..979d3146b081 100644 --- a/net/socket.c +++ b/net/socket.c @@ -587,20 +587,20 @@ void sock_release(struct socket *sock) } EXPORT_SYMBOL(sock_release); -void __sock_tx_timestamp(const struct sock *sk, __u8 *tx_flags) +void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags) { u8 flags = *tx_flags; - if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_HARDWARE) + if (tsflags & SOF_TIMESTAMPING_TX_HARDWARE) flags |= SKBTX_HW_TSTAMP; - if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_SOFTWARE) + if (tsflags & SOF_TIMESTAMPING_TX_SOFTWARE) flags |= SKBTX_SW_TSTAMP; - if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_SCHED) + if (tsflags & SOF_TIMESTAMPING_TX_SCHED) flags |= SKBTX_SCHED_TSTAMP; - if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK) + if (tsflags & SOF_TIMESTAMPING_TX_ACK) flags |= SKBTX_ACK_TSTAMP; *tx_flags = flags; From fd91e12f594b40fdb2dad530e8b895cc5c07db21 Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Sat, 2 Apr 2016 23:08:13 -0400 Subject: [PATCH 8/8] sock: document timestamping via cmsg in Documentation Update docs and add code snippet for using cmsg for timestamping. Signed-off-by: Soheil Hassas Yeganeh Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- Documentation/networking/timestamping.txt | 48 +++++++++++++++++++++-- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/Documentation/networking/timestamping.txt b/Documentation/networking/timestamping.txt index a977339fbe0a..671cccf0dcd2 100644 --- a/Documentation/networking/timestamping.txt +++ b/Documentation/networking/timestamping.txt @@ -44,11 +44,17 @@ timeval of SO_TIMESTAMP (ms). Supports multiple types of timestamp requests. As a result, this socket option takes a bitmap of flags, not a boolean. In - err = setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, (void *) val, &val); + err = setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, (void *) val, + sizeof(val)); val is an integer with any of the following bits set. Setting other bit returns EINVAL and does not change the current state. +The socket option configures timestamp generation for individual +sk_buffs (1.3.1), timestamp reporting to the socket's error +queue (1.3.2) and options (1.3.3). Timestamp generation can also +be enabled for individual sendmsg calls using cmsg (1.3.4). + 1.3.1 Timestamp Generation @@ -71,13 +77,16 @@ SOF_TIMESTAMPING_RX_SOFTWARE: kernel receive stack. SOF_TIMESTAMPING_TX_HARDWARE: - Request tx timestamps generated by the network adapter. + Request tx timestamps generated by the network adapter. This flag + can be enabled via both socket options and control messages. SOF_TIMESTAMPING_TX_SOFTWARE: Request tx timestamps when data leaves the kernel. These timestamps are generated in the device driver as close as possible, but always prior to, passing the packet to the network interface. Hence, they require driver support and may not be available for all devices. + This flag can be enabled via both socket options and control messages. + SOF_TIMESTAMPING_TX_SCHED: Request tx timestamps prior to entering the packet scheduler. Kernel @@ -90,7 +99,8 @@ SOF_TIMESTAMPING_TX_SCHED: machines with virtual devices where a transmitted packet travels through multiple devices and, hence, multiple packet schedulers, a timestamp is generated at each layer. This allows for fine - grained measurement of queuing delay. + grained measurement of queuing delay. This flag can be enabled + via both socket options and control messages. SOF_TIMESTAMPING_TX_ACK: Request tx timestamps when all data in the send buffer has been @@ -99,6 +109,7 @@ SOF_TIMESTAMPING_TX_ACK: over-report measurement, because the timestamp is generated when all data up to and including the buffer at send() was acknowledged: the cumulative acknowledgment. The mechanism ignores SACK and FACK. + This flag can be enabled via both socket options and control messages. 1.3.2 Timestamp Reporting @@ -183,6 +194,37 @@ having access to the contents of the original packet, so cannot be combined with SOF_TIMESTAMPING_OPT_TSONLY. +1.3.4. Enabling timestamps via control messages + +In addition to socket options, timestamp generation can be requested +per write via cmsg, only for SOF_TIMESTAMPING_TX_* (see Section 1.3.1). +Using this feature, applications can sample timestamps per sendmsg() +without paying the overhead of enabling and disabling timestamps via +setsockopt: + + struct msghdr *msg; + ... + cmsg = CMSG_FIRSTHDR(msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SO_TIMESTAMPING; + cmsg->cmsg_len = CMSG_LEN(sizeof(__u32)); + *((__u32 *) CMSG_DATA(cmsg)) = SOF_TIMESTAMPING_TX_SCHED | + SOF_TIMESTAMPING_TX_SOFTWARE | + SOF_TIMESTAMPING_TX_ACK; + err = sendmsg(fd, msg, 0); + +The SOF_TIMESTAMPING_TX_* flags set via cmsg will override +the SOF_TIMESTAMPING_TX_* flags set via setsockopt. + +Moreover, applications must still enable timestamp reporting via +setsockopt to receive timestamps: + + __u32 val = SOF_TIMESTAMPING_SOFTWARE | + SOF_TIMESTAMPING_OPT_ID /* or any other flag */; + err = setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, (void *) val, + sizeof(val)); + + 1.4 Bytestream Timestamps The SO_TIMESTAMPING interface supports timestamping of bytes in a