net: Modify sk_alloc to not reference count the netns of kernel sockets.

Now that sk_alloc knows when a kernel socket is being allocated modify
it to not reference count the network namespace of kernel sockets.

Keep track of if a socket needs reference counting by adding a flag to
struct sock called sk_net_refcnt.

Update all of the callers of sock_create_kern to stop using
sk_change_net and sk_release_kernel as those hacks are no longer
needed, to avoid reference counting a kernel socket.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Eric W. Biederman 2015-05-08 21:10:31 -05:00 committed by David S. Miller
parent 11aa9c28b4
commit 26abe14379
8 changed files with 30 additions and 45 deletions

View File

@ -41,7 +41,7 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len,
static inline void inet_ctl_sock_destroy(struct sock *sk)
{
sk_release_kernel(sk);
sock_release(sk->sk_socket);
}
#endif

View File

@ -184,6 +184,7 @@ struct sock_common {
unsigned char skc_reuse:4;
unsigned char skc_reuseport:1;
unsigned char skc_ipv6only:1;
unsigned char skc_net_refcnt:1;
int skc_bound_dev_if;
union {
struct hlist_node skc_bind_node;
@ -323,6 +324,7 @@ struct sock {
#define sk_reuse __sk_common.skc_reuse
#define sk_reuseport __sk_common.skc_reuseport
#define sk_ipv6only __sk_common.skc_ipv6only
#define sk_net_refcnt __sk_common.skc_net_refcnt
#define sk_bound_dev_if __sk_common.skc_bound_dev_if
#define sk_bind_node __sk_common.skc_bind_node
#define sk_prot __sk_common.skc_prot

View File

@ -1412,7 +1412,10 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
*/
sk->sk_prot = sk->sk_prot_creator = prot;
sock_lock_init(sk);
sock_net_set(sk, get_net(net));
sk->sk_net_refcnt = kern ? 0 : 1;
if (likely(sk->sk_net_refcnt))
get_net(net);
sock_net_set(sk, net);
atomic_set(&sk->sk_wmem_alloc, 1);
sock_update_classid(sk);
@ -1446,6 +1449,7 @@ static void __sk_free(struct sock *sk)
if (sk->sk_peer_cred)
put_cred(sk->sk_peer_cred);
put_pid(sk->sk_peer_pid);
if (likely(sk->sk_net_refcnt))
put_net(sock_net(sk));
sk_prot_free(sk->sk_prot_creator, sk);
}

View File

@ -1430,7 +1430,7 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family,
struct net *net)
{
struct socket *sock;
int rc = sock_create_kern(&init_net, family, type, protocol, &sock);
int rc = sock_create_kern(net, family, type, protocol, &sock);
if (rc == 0) {
*sk = sock->sk;
@ -1440,8 +1440,6 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family,
* we do not wish this socket to see incoming packets.
*/
(*sk)->sk_prot->unhash(*sk);
sk_change_net(*sk, net);
}
return rc;
}

View File

@ -15,12 +15,10 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
struct socket *sock = NULL;
struct sockaddr_in udp_addr;
err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, 0, &sock);
err = sock_create_kern(net, AF_INET, SOCK_DGRAM, 0, &sock);
if (err < 0)
goto error;
sk_change_net(sock->sk, net);
udp_addr.sin_family = AF_INET;
udp_addr.sin_addr = cfg->local_ip;
udp_addr.sin_port = cfg->local_udp_port;
@ -47,7 +45,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
error:
if (sock) {
kernel_sock_shutdown(sock, SHUT_RDWR);
sk_release_kernel(sock->sk);
sock_release(sock);
}
*sockp = NULL;
return err;
@ -101,7 +99,7 @@ void udp_tunnel_sock_release(struct socket *sock)
{
rcu_assign_sk_user_data(sock->sk, NULL);
kernel_sock_shutdown(sock, SHUT_RDWR);
sk_release_kernel(sock->sk);
sock_release(sock);
}
EXPORT_SYMBOL_GPL(udp_tunnel_sock_release);

View File

@ -19,12 +19,10 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg,
int err;
struct socket *sock = NULL;
err = sock_create_kern(&init_net, AF_INET6, SOCK_DGRAM, 0, &sock);
err = sock_create_kern(net, AF_INET6, SOCK_DGRAM, 0, &sock);
if (err < 0)
goto error;
sk_change_net(sock->sk, net);
udp6_addr.sin6_family = AF_INET6;
memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6,
sizeof(udp6_addr.sin6_addr));
@ -55,7 +53,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg,
error:
if (sock) {
kernel_sock_shutdown(sock, SHUT_RDWR);
sk_release_kernel(sock->sk);
sock_release(sock);
}
*sockp = NULL;
return err;

View File

@ -1334,9 +1334,10 @@ static void l2tp_tunnel_del_work(struct work_struct *work)
if (sock)
inet_shutdown(sock, 2);
} else {
if (sock)
if (sock) {
kernel_sock_shutdown(sock, SHUT_RDWR);
sk_release_kernel(sk);
sock_release(sock);
}
}
l2tp_tunnel_sock_put(sk);
@ -1399,13 +1400,11 @@ static int l2tp_tunnel_sock_create(struct net *net,
if (cfg->local_ip6 && cfg->peer_ip6) {
struct sockaddr_l2tpip6 ip6_addr = {0};
err = sock_create_kern(&init_net, AF_INET6, SOCK_DGRAM,
err = sock_create_kern(net, AF_INET6, SOCK_DGRAM,
IPPROTO_L2TP, &sock);
if (err < 0)
goto out;
sk_change_net(sock->sk, net);
ip6_addr.l2tp_family = AF_INET6;
memcpy(&ip6_addr.l2tp_addr, cfg->local_ip6,
sizeof(ip6_addr.l2tp_addr));
@ -1429,13 +1428,11 @@ static int l2tp_tunnel_sock_create(struct net *net,
{
struct sockaddr_l2tpip ip_addr = {0};
err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM,
err = sock_create_kern(net, AF_INET, SOCK_DGRAM,
IPPROTO_L2TP, &sock);
if (err < 0)
goto out;
sk_change_net(sock->sk, net);
ip_addr.l2tp_family = AF_INET;
ip_addr.l2tp_addr = cfg->local_ip;
ip_addr.l2tp_conn_id = tunnel_id;
@ -1462,7 +1459,7 @@ static int l2tp_tunnel_sock_create(struct net *net,
*sockp = sock;
if ((err < 0) && sock) {
kernel_sock_shutdown(sock, SHUT_RDWR);
sk_release_kernel(sock->sk);
sock_release(sock);
*sockp = NULL;
}

View File

@ -1457,18 +1457,12 @@ static struct socket *make_send_sock(struct net *net, int id)
struct socket *sock;
int result;
/* First create a socket move it to right name space later */
result = sock_create_kern(&init_net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
/* First create a socket */
result = sock_create_kern(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
if (result < 0) {
pr_err("Error during creation of socket; terminating\n");
return ERR_PTR(result);
}
/*
* Kernel sockets that are a part of a namespace, should not
* hold a reference to a namespace in order to allow to stop it.
* After sk_change_net should be released using sk_release_kernel.
*/
sk_change_net(sock->sk, net);
result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn);
if (result < 0) {
pr_err("Error setting outbound mcast interface\n");
@ -1497,7 +1491,7 @@ static struct socket *make_send_sock(struct net *net, int id)
return sock;
error:
sk_release_kernel(sock->sk);
sock_release(sock);
return ERR_PTR(result);
}
@ -1518,17 +1512,11 @@ static struct socket *make_receive_sock(struct net *net, int id)
int result;
/* First create a socket */
result = sock_create_kern(&init_net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
result = sock_create_kern(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
if (result < 0) {
pr_err("Error during creation of socket; terminating\n");
return ERR_PTR(result);
}
/*
* Kernel sockets that are a part of a namespace, should not
* hold a reference to a namespace in order to allow to stop it.
* After sk_change_net should be released using sk_release_kernel.
*/
sk_change_net(sock->sk, net);
/* it is equivalent to the REUSEADDR option in user-space */
sock->sk->sk_reuse = SK_CAN_REUSE;
result = sysctl_sync_sock_size(ipvs);
@ -1554,7 +1542,7 @@ static struct socket *make_receive_sock(struct net *net, int id)
return sock;
error:
sk_release_kernel(sock->sk);
sock_release(sock);
return ERR_PTR(result);
}
@ -1692,7 +1680,7 @@ static int sync_thread_master(void *data)
ip_vs_sync_buff_release(sb);
/* release the sending multicast socket */
sk_release_kernel(tinfo->sock->sk);
sock_release(tinfo->sock);
kfree(tinfo);
return 0;
@ -1729,7 +1717,7 @@ static int sync_thread_backup(void *data)
}
/* release the sending multicast socket */
sk_release_kernel(tinfo->sock->sk);
sock_release(tinfo->sock);
kfree(tinfo->buf);
kfree(tinfo);
@ -1854,11 +1842,11 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid)
return 0;
outsocket:
sk_release_kernel(sock->sk);
sock_release(sock);
outtinfo:
if (tinfo) {
sk_release_kernel(tinfo->sock->sk);
sock_release(tinfo->sock);
kfree(tinfo->buf);
kfree(tinfo);
}