xref: /kernel/linux/linux-6.6/net/sched/act_ct.c (revision 62306a36)
1// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2/* -
3 * net/sched/act_ct.c  Connection Tracking action
4 *
5 * Authors:   Paul Blakey <paulb@mellanox.com>
6 *            Yossi Kuperman <yossiku@mellanox.com>
7 *            Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
8 */
9
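/* Illustrative usage (a sketch only; exact syntax depends on the iproute2
 * version, and the device names and chain numbers below are placeholders):
 *
 *   tc filter add dev eth0 ingress chain 0 prio 1 proto ip flower \
 *       ct_state -trk action ct zone 1 pipe action goto chain 1
 *   tc filter add dev eth0 ingress chain 1 prio 1 proto ip flower \
 *       ct_state +trk+new action ct zone 1 commit pipe \
 *       action mirred egress redirect dev eth1
 *   tc filter add dev eth0 ingress chain 1 prio 2 proto ip flower \
 *       ct_state +trk+est action mirred egress redirect dev eth1
 *
 * Untracked packets are run through conntrack in zone 1, new connections
 * are committed, and established traffic is forwarded.
 */
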
10#include <linux/module.h>
11#include <linux/init.h>
12#include <linux/kernel.h>
13#include <linux/skbuff.h>
14#include <linux/rtnetlink.h>
15#include <linux/pkt_cls.h>
16#include <linux/ip.h>
17#include <linux/ipv6.h>
18#include <linux/rhashtable.h>
19#include <net/netlink.h>
20#include <net/pkt_sched.h>
21#include <net/pkt_cls.h>
22#include <net/act_api.h>
23#include <net/ip.h>
24#include <net/ipv6_frag.h>
25#include <uapi/linux/tc_act/tc_ct.h>
26#include <net/tc_act/tc_ct.h>
27#include <net/tc_wrapper.h>
28
29#include <net/netfilter/nf_flow_table.h>
30#include <net/netfilter/nf_conntrack.h>
31#include <net/netfilter/nf_conntrack_core.h>
32#include <net/netfilter/nf_conntrack_zones.h>
33#include <net/netfilter/nf_conntrack_helper.h>
34#include <net/netfilter/nf_conntrack_acct.h>
35#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
36#include <net/netfilter/nf_conntrack_act_ct.h>
37#include <net/netfilter/nf_conntrack_seqadj.h>
38#include <uapi/linux/netfilter/nf_nat.h>
39
40static struct workqueue_struct *act_ct_wq;
41static struct rhashtable zones_ht;
42static DEFINE_MUTEX(zones_mutex);
43
44struct tcf_ct_flow_table {
45	struct rhash_head node; /* In zones tables */
46
47	struct rcu_work rwork;
48	struct nf_flowtable nf_ft;
49	refcount_t ref;
50	u16 zone;
51
52	bool dying;
53};
54
55static const struct rhashtable_params zones_params = {
56	.head_offset = offsetof(struct tcf_ct_flow_table, node),
57	.key_offset = offsetof(struct tcf_ct_flow_table, zone),
58	.key_len = sizeof_field(struct tcf_ct_flow_table, zone),
59	.automatic_shrinking = true,
60};
61
62static struct flow_action_entry *
63tcf_ct_flow_table_flow_action_get_next(struct flow_action *flow_action)
64{
65	int i = flow_action->num_entries++;
66
67	return &flow_action->entries[i];
68}
69
70static void tcf_ct_add_mangle_action(struct flow_action *action,
71				     enum flow_action_mangle_base htype,
72				     u32 offset,
73				     u32 mask,
74				     u32 val)
75{
76	struct flow_action_entry *entry;
77
78	entry = tcf_ct_flow_table_flow_action_get_next(action);
79	entry->id = FLOW_ACTION_MANGLE;
80	entry->mangle.htype = htype;
81	entry->mangle.mask = ~mask;
82	entry->mangle.offset = offset;
83	entry->mangle.val = val;
84}
85
86/* The following NAT helper functions check whether the inverted reply tuple
87 * (target) differs from the current direction's tuple - meaning NAT of the
88 * ports and/or IP addresses is needed - and add the relevant mangle actions.
89 */
90static void
91tcf_ct_flow_table_add_action_nat_ipv4(const struct nf_conntrack_tuple *tuple,
92				      struct nf_conntrack_tuple target,
93				      struct flow_action *action)
94{
95	if (memcmp(&target.src.u3, &tuple->src.u3, sizeof(target.src.u3)))
96		tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_IP4,
97					 offsetof(struct iphdr, saddr),
98					 0xFFFFFFFF,
99					 be32_to_cpu(target.src.u3.ip));
100	if (memcmp(&target.dst.u3, &tuple->dst.u3, sizeof(target.dst.u3)))
101		tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_IP4,
102					 offsetof(struct iphdr, daddr),
103					 0xFFFFFFFF,
104					 be32_to_cpu(target.dst.u3.ip));
105}
106
107static void
108tcf_ct_add_ipv6_addr_mangle_action(struct flow_action *action,
109				   union nf_inet_addr *addr,
110				   u32 offset)
111{
112	int i;
113
114	for (i = 0; i < sizeof(struct in6_addr) / sizeof(u32); i++)
115		tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_IP6,
116					 i * sizeof(u32) + offset,
117					 0xFFFFFFFF, be32_to_cpu(addr->ip6[i]));
118}
119
120static void
121tcf_ct_flow_table_add_action_nat_ipv6(const struct nf_conntrack_tuple *tuple,
122				      struct nf_conntrack_tuple target,
123				      struct flow_action *action)
124{
125	if (memcmp(&target.src.u3, &tuple->src.u3, sizeof(target.src.u3)))
126		tcf_ct_add_ipv6_addr_mangle_action(action, &target.src.u3,
127						   offsetof(struct ipv6hdr,
128							    saddr));
129	if (memcmp(&target.dst.u3, &tuple->dst.u3, sizeof(target.dst.u3)))
130		tcf_ct_add_ipv6_addr_mangle_action(action, &target.dst.u3,
131						   offsetof(struct ipv6hdr,
132							    daddr));
133}
134
135static void
136tcf_ct_flow_table_add_action_nat_tcp(const struct nf_conntrack_tuple *tuple,
137				     struct nf_conntrack_tuple target,
138				     struct flow_action *action)
139{
140	__be16 target_src = target.src.u.tcp.port;
141	__be16 target_dst = target.dst.u.tcp.port;
142
143	if (target_src != tuple->src.u.tcp.port)
144		tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_TCP,
145					 offsetof(struct tcphdr, source),
146					 0xFFFF, be16_to_cpu(target_src));
147	if (target_dst != tuple->dst.u.tcp.port)
148		tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_TCP,
149					 offsetof(struct tcphdr, dest),
150					 0xFFFF, be16_to_cpu(target_dst));
151}
152
153static void
154tcf_ct_flow_table_add_action_nat_udp(const struct nf_conntrack_tuple *tuple,
155				     struct nf_conntrack_tuple target,
156				     struct flow_action *action)
157{
158	__be16 target_src = target.src.u.udp.port;
159	__be16 target_dst = target.dst.u.udp.port;
160
161	if (target_src != tuple->src.u.udp.port)
162		tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_UDP,
163					 offsetof(struct udphdr, source),
164					 0xFFFF, be16_to_cpu(target_src));
165	if (target_dst != tuple->dst.u.udp.port)
166		tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_UDP,
167					 offsetof(struct udphdr, dest),
168					 0xFFFF, be16_to_cpu(target_dst));
169}
170
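/* Fill a FLOW_ACTION_CT_METADATA entry with the connection's mark, labels,
 * direction and a cookie encoding the ct pointer and ctinfo, so that
 * conntrack state can be restored for packets hitting the offloaded flow.
 */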
171static void tcf_ct_flow_table_add_action_meta(struct nf_conn *ct,
172					      enum ip_conntrack_dir dir,
173					      enum ip_conntrack_info ctinfo,
174					      struct flow_action *action)
175{
176	struct nf_conn_labels *ct_labels;
177	struct flow_action_entry *entry;
178	u32 *act_ct_labels;
179
180	entry = tcf_ct_flow_table_flow_action_get_next(action);
181	entry->id = FLOW_ACTION_CT_METADATA;
182#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
183	entry->ct_metadata.mark = READ_ONCE(ct->mark);
184#endif
185	/* aligns with the CT reference stored in skb->_nfct by nf_ct_set() */
186	entry->ct_metadata.cookie = (unsigned long)ct | ctinfo;
187	entry->ct_metadata.orig_dir = dir == IP_CT_DIR_ORIGINAL;
188
189	act_ct_labels = entry->ct_metadata.labels;
190	ct_labels = nf_ct_labels_find(ct);
191	if (ct_labels)
192		memcpy(act_ct_labels, ct_labels->bits, NF_CT_LABELS_MAX_SIZE);
193	else
194		memset(act_ct_labels, 0, NF_CT_LABELS_MAX_SIZE);
195}
196
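/* If the connection is NATed, derive the NAT target from the inverted reply
 * tuple and emit the address/port mangle actions needed to rewrite packets
 * in the given direction. Only IPv4/IPv6 with TCP/UDP are supported here.
 */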
197static int tcf_ct_flow_table_add_action_nat(struct net *net,
198					    struct nf_conn *ct,
199					    enum ip_conntrack_dir dir,
200					    struct flow_action *action)
201{
202	const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple;
203	struct nf_conntrack_tuple target;
204
205	if (!(ct->status & IPS_NAT_MASK))
206		return 0;
207
208	nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple);
209
210	switch (tuple->src.l3num) {
211	case NFPROTO_IPV4:
212		tcf_ct_flow_table_add_action_nat_ipv4(tuple, target,
213						      action);
214		break;
215	case NFPROTO_IPV6:
216		tcf_ct_flow_table_add_action_nat_ipv6(tuple, target,
217						      action);
218		break;
219	default:
220		return -EOPNOTSUPP;
221	}
222
223	switch (nf_ct_protonum(ct)) {
224	case IPPROTO_TCP:
225		tcf_ct_flow_table_add_action_nat_tcp(tuple, target, action);
226		break;
227	case IPPROTO_UDP:
228		tcf_ct_flow_table_add_action_nat_udp(tuple, target, action);
229		break;
230	default:
231		return -EOPNOTSUPP;
232	}
233
234	return 0;
235}
236
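/* .action callback of the act_ct flowtable type: translate an offloaded flow
 * into NAT mangle actions plus a CT metadata action for the requested
 * direction. Partially filled entries are cleared again on error.
 */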
237static int tcf_ct_flow_table_fill_actions(struct net *net,
238					  struct flow_offload *flow,
239					  enum flow_offload_tuple_dir tdir,
240					  struct nf_flow_rule *flow_rule)
241{
242	struct flow_action *action = &flow_rule->rule->action;
243	int num_entries = action->num_entries;
244	struct nf_conn *ct = flow->ct;
245	enum ip_conntrack_info ctinfo;
246	enum ip_conntrack_dir dir;
247	int i, err;
248
249	switch (tdir) {
250	case FLOW_OFFLOAD_DIR_ORIGINAL:
251		dir = IP_CT_DIR_ORIGINAL;
252		ctinfo = test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ?
253			IP_CT_ESTABLISHED : IP_CT_NEW;
254		if (ctinfo == IP_CT_ESTABLISHED)
255			set_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags);
256		break;
257	case FLOW_OFFLOAD_DIR_REPLY:
258		dir = IP_CT_DIR_REPLY;
259		ctinfo = IP_CT_ESTABLISHED_REPLY;
260		break;
261	default:
262		return -EOPNOTSUPP;
263	}
264
265	err = tcf_ct_flow_table_add_action_nat(net, ct, dir, action);
266	if (err)
267		goto err_nat;
268
269	tcf_ct_flow_table_add_action_meta(ct, dir, ctinfo, action);
270	return 0;
271
272err_nat:
273	/* Clear filled actions */
274	for (i = num_entries; i < action->num_entries; i++)
275		memset(&action->entries[i], 0, sizeof(action->entries[i]));
276	action->num_entries = num_entries;
277
278	return err;
279}
280
281static bool tcf_ct_flow_is_outdated(const struct flow_offload *flow)
282{
283	return test_bit(IPS_SEEN_REPLY_BIT, &flow->ct->status) &&
284	       test_bit(IPS_HW_OFFLOAD_BIT, &flow->ct->status) &&
285	       !test_bit(NF_FLOW_HW_PENDING, &flow->flags) &&
286	       !test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags);
287}
288
289static void tcf_ct_flow_table_get_ref(struct tcf_ct_flow_table *ct_ft);
290
291static void tcf_ct_nf_get(struct nf_flowtable *ft)
292{
293	struct tcf_ct_flow_table *ct_ft =
294		container_of(ft, struct tcf_ct_flow_table, nf_ft);
295
296	tcf_ct_flow_table_get_ref(ct_ft);
297}
298
299static void tcf_ct_flow_table_put(struct tcf_ct_flow_table *ct_ft);
300
301static void tcf_ct_nf_put(struct nf_flowtable *ft)
302{
303	struct tcf_ct_flow_table *ct_ft =
304		container_of(ft, struct tcf_ct_flow_table, nf_ft);
305
306	tcf_ct_flow_table_put(ct_ft);
307}
308
309static struct nf_flowtable_type flowtable_ct = {
310	.gc		= tcf_ct_flow_is_outdated,
311	.action		= tcf_ct_flow_table_fill_actions,
312	.get		= tcf_ct_nf_get,
313	.put		= tcf_ct_nf_put,
314	.owner		= THIS_MODULE,
315};
316
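/* Look up (or create) the per-zone flow table shared by all ct actions in
 * the same zone. Serialized by zones_mutex; a new table is inserted into
 * zones_ht and initialized with hardware offload and counters enabled.
 */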
317static int tcf_ct_flow_table_get(struct net *net, struct tcf_ct_params *params)
318{
319	struct tcf_ct_flow_table *ct_ft;
320	int err = -ENOMEM;
321
322	mutex_lock(&zones_mutex);
323	ct_ft = rhashtable_lookup_fast(&zones_ht, &params->zone, zones_params);
324	if (ct_ft && refcount_inc_not_zero(&ct_ft->ref))
325		goto out_unlock;
326
327	ct_ft = kzalloc(sizeof(*ct_ft), GFP_KERNEL);
328	if (!ct_ft)
329		goto err_alloc;
330	refcount_set(&ct_ft->ref, 1);
331
332	ct_ft->zone = params->zone;
333	err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params);
334	if (err)
335		goto err_insert;
336
337	ct_ft->nf_ft.type = &flowtable_ct;
338	ct_ft->nf_ft.flags |= NF_FLOWTABLE_HW_OFFLOAD |
339			      NF_FLOWTABLE_COUNTER;
340	err = nf_flow_table_init(&ct_ft->nf_ft);
341	if (err)
342		goto err_init;
343	write_pnet(&ct_ft->nf_ft.net, net);
344
345	__module_get(THIS_MODULE);
346out_unlock:
347	params->ct_ft = ct_ft;
348	params->nf_ft = &ct_ft->nf_ft;
349	mutex_unlock(&zones_mutex);
350
351	return 0;
352
353err_init:
354	rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
355err_insert:
356	kfree(ct_ft);
357err_alloc:
358	mutex_unlock(&zones_mutex);
359	return err;
360}
361
362static void tcf_ct_flow_table_get_ref(struct tcf_ct_flow_table *ct_ft)
363{
364	refcount_inc(&ct_ft->ref);
365}
366
367static void tcf_ct_flow_table_cleanup_work(struct work_struct *work)
368{
369	struct tcf_ct_flow_table *ct_ft;
370	struct flow_block *block;
371
372	ct_ft = container_of(to_rcu_work(work), struct tcf_ct_flow_table,
373			     rwork);
374	nf_flow_table_free(&ct_ft->nf_ft);
375
376	block = &ct_ft->nf_ft.flow_block;
377	down_write(&ct_ft->nf_ft.flow_block_lock);
378	WARN_ON(!list_empty(&block->cb_list));
379	up_write(&ct_ft->nf_ft.flow_block_lock);
380	kfree(ct_ft);
381
382	module_put(THIS_MODULE);
383}
384
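/* Drop a reference to the per-zone flow table. The final put removes it
 * from zones_ht and frees it from act_ct_wq after an RCU grace period.
 */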
385static void tcf_ct_flow_table_put(struct tcf_ct_flow_table *ct_ft)
386{
387	if (refcount_dec_and_test(&ct_ft->ref)) {
388		rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
389		INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work);
390		queue_rcu_work(act_ct_wq, &ct_ft->rwork);
391	}
392}
393
394static void tcf_ct_flow_tc_ifidx(struct flow_offload *entry,
395				 struct nf_conn_act_ct_ext *act_ct_ext, u8 dir)
396{
397	entry->tuplehash[dir].tuple.xmit_type = FLOW_OFFLOAD_XMIT_TC;
398	entry->tuplehash[dir].tuple.tc.iifidx = act_ct_ext->ifindex[dir];
399}
400
401static void tcf_ct_flow_ct_ext_ifidx_update(struct flow_offload *entry)
402{
403	struct nf_conn_act_ct_ext *act_ct_ext;
404
405	act_ct_ext = nf_conn_act_ct_ext_find(entry->ct);
406	if (act_ct_ext) {
407		tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_ORIGINAL);
408		tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_REPLY);
409	}
410}
411
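/* Offload a tracked connection into the zone's flow table. IPS_OFFLOAD_BIT
 * guards against adding the same conntrack entry twice; TCP connections are
 * switched to liberal window tracking, and NF_FLOW_HW_BIDIRECTIONAL marks
 * flows whose reply direction may be offloaded as well.
 */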
412static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft,
413				  struct nf_conn *ct,
414				  bool tcp, bool bidirectional)
415{
416	struct nf_conn_act_ct_ext *act_ct_ext;
417	struct flow_offload *entry;
418	int err;
419
420	if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status))
421		return;
422
423	entry = flow_offload_alloc(ct);
424	if (!entry) {
425		WARN_ON_ONCE(1);
426		goto err_alloc;
427	}
428
429	if (tcp) {
430		ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
431		ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
432	}
433	if (bidirectional)
434		__set_bit(NF_FLOW_HW_BIDIRECTIONAL, &entry->flags);
435
436	act_ct_ext = nf_conn_act_ct_ext_find(ct);
437	if (act_ct_ext) {
438		tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_ORIGINAL);
439		tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_REPLY);
440	}
441
442	err = flow_offload_add(&ct_ft->nf_ft, entry);
443	if (err)
444		goto err_add;
445
446	return;
447
448err_add:
449	flow_offload_free(entry);
450err_alloc:
451	clear_bit(IPS_OFFLOAD_BIT, &ct->status);
452}
453
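/* Decide whether a connection seen by the ct action is eligible for
 * flow-table offload: established and assured TCP, confirmed UDP (reply
 * direction only once assured), or keyless GREv0 without NAT. Connections
 * using helpers or sequence adjustment stay in the slow path.
 */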
454static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft,
455					   struct nf_conn *ct,
456					   enum ip_conntrack_info ctinfo)
457{
458	bool tcp = false, bidirectional = true;
459
460	switch (nf_ct_protonum(ct)) {
461	case IPPROTO_TCP:
462		if ((ctinfo != IP_CT_ESTABLISHED &&
463		     ctinfo != IP_CT_ESTABLISHED_REPLY) ||
464		    !test_bit(IPS_ASSURED_BIT, &ct->status) ||
465		    ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED)
466			return;
467
468		tcp = true;
469		break;
470	case IPPROTO_UDP:
471		if (!nf_ct_is_confirmed(ct))
472			return;
473		if (!test_bit(IPS_ASSURED_BIT, &ct->status))
474			bidirectional = false;
475		break;
476#ifdef CONFIG_NF_CT_PROTO_GRE
477	case IPPROTO_GRE: {
478		struct nf_conntrack_tuple *tuple;
479
480		if ((ctinfo != IP_CT_ESTABLISHED &&
481		     ctinfo != IP_CT_ESTABLISHED_REPLY) ||
482		    !test_bit(IPS_ASSURED_BIT, &ct->status) ||
483		    ct->status & IPS_NAT_MASK)
484			return;
485
486		tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
487		/* No support for GRE v1 */
488		if (tuple->src.u.gre.key || tuple->dst.u.gre.key)
489			return;
490		break;
491	}
492#endif
493	default:
494		return;
495	}
496
497	if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) ||
498	    ct->status & IPS_SEQ_ADJUST)
499		return;
500
501	tcf_ct_flow_table_add(ct_ft, ct, tcp, bidirectional);
502}
503
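/* The two helpers below parse just enough of an IPv4/IPv6 packet to build a
 * flow_offload_tuple for the software fast-path lookup. They bail out on
 * fragments, IPv4 options, unsupported protocols and an expiring TTL/hop
 * limit, and also return the TCP header so FIN/RST can be checked.
 */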
504static bool
505tcf_ct_flow_table_fill_tuple_ipv4(struct sk_buff *skb,
506				  struct flow_offload_tuple *tuple,
507				  struct tcphdr **tcph)
508{
509	struct flow_ports *ports;
510	unsigned int thoff;
511	struct iphdr *iph;
512	size_t hdrsize;
513	u8 ipproto;
514
515	if (!pskb_network_may_pull(skb, sizeof(*iph)))
516		return false;
517
518	iph = ip_hdr(skb);
519	thoff = iph->ihl * 4;
520
521	if (ip_is_fragment(iph) ||
522	    unlikely(thoff != sizeof(struct iphdr)))
523		return false;
524
525	ipproto = iph->protocol;
526	switch (ipproto) {
527	case IPPROTO_TCP:
528		hdrsize = sizeof(struct tcphdr);
529		break;
530	case IPPROTO_UDP:
531		hdrsize = sizeof(*ports);
532		break;
533#ifdef CONFIG_NF_CT_PROTO_GRE
534	case IPPROTO_GRE:
535		hdrsize = sizeof(struct gre_base_hdr);
536		break;
537#endif
538	default:
539		return false;
540	}
541
542	if (iph->ttl <= 1)
543		return false;
544
545	if (!pskb_network_may_pull(skb, thoff + hdrsize))
546		return false;
547
548	switch (ipproto) {
549	case IPPROTO_TCP:
550		*tcph = (void *)(skb_network_header(skb) + thoff);
551		fallthrough;
552	case IPPROTO_UDP:
553		ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
554		tuple->src_port = ports->source;
555		tuple->dst_port = ports->dest;
556		break;
557	case IPPROTO_GRE: {
558		struct gre_base_hdr *greh;
559
560		greh = (struct gre_base_hdr *)(skb_network_header(skb) + thoff);
561		if ((greh->flags & GRE_VERSION) != GRE_VERSION_0)
562			return false;
563		break;
564	}
565	}
566
567	iph = ip_hdr(skb);
568
569	tuple->src_v4.s_addr = iph->saddr;
570	tuple->dst_v4.s_addr = iph->daddr;
571	tuple->l3proto = AF_INET;
572	tuple->l4proto = ipproto;
573
574	return true;
575}
576
577static bool
578tcf_ct_flow_table_fill_tuple_ipv6(struct sk_buff *skb,
579				  struct flow_offload_tuple *tuple,
580				  struct tcphdr **tcph)
581{
582	struct flow_ports *ports;
583	struct ipv6hdr *ip6h;
584	unsigned int thoff;
585	size_t hdrsize;
586	u8 nexthdr;
587
588	if (!pskb_network_may_pull(skb, sizeof(*ip6h)))
589		return false;
590
591	ip6h = ipv6_hdr(skb);
592	thoff = sizeof(*ip6h);
593
594	nexthdr = ip6h->nexthdr;
595	switch (nexthdr) {
596	case IPPROTO_TCP:
597		hdrsize = sizeof(struct tcphdr);
598		break;
599	case IPPROTO_UDP:
600		hdrsize = sizeof(*ports);
601		break;
602#ifdef CONFIG_NF_CT_PROTO_GRE
603	case IPPROTO_GRE:
604		hdrsize = sizeof(struct gre_base_hdr);
605		break;
606#endif
607	default:
608		return false;
609	}
610
611	if (ip6h->hop_limit <= 1)
612		return false;
613
614	if (!pskb_network_may_pull(skb, thoff + hdrsize))
615		return false;
616
617	switch (nexthdr) {
618	case IPPROTO_TCP:
619		*tcph = (void *)(skb_network_header(skb) + thoff);
620		fallthrough;
621	case IPPROTO_UDP:
622		ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
623		tuple->src_port = ports->source;
624		tuple->dst_port = ports->dest;
625		break;
626	case IPPROTO_GRE: {
627		struct gre_base_hdr *greh;
628
629		greh = (struct gre_base_hdr *)(skb_network_header(skb) + thoff);
630		if ((greh->flags & GRE_VERSION) != GRE_VERSION_0)
631			return false;
632		break;
633	}
634	}
635
636	ip6h = ipv6_hdr(skb);
637
638	tuple->src_v6 = ip6h->saddr;
639	tuple->dst_v6 = ip6h->daddr;
640	tuple->l3proto = AF_INET6;
641	tuple->l4proto = nexthdr;
642
643	return true;
644}
645
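/* Software fast path: look the packet up in the zone's flow table and, on a
 * hit, refresh the flow and attach the conntrack entry and ctinfo to the
 * skb, bypassing nf_conntrack_in(). TCP FIN/RST tears the flow down, and
 * not-yet-assured connections still go through the slow path so they can
 * be promoted.
 */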
646static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p,
647				     struct sk_buff *skb,
648				     u8 family)
649{
650	struct nf_flowtable *nf_ft = &p->ct_ft->nf_ft;
651	struct flow_offload_tuple_rhash *tuplehash;
652	struct flow_offload_tuple tuple = {};
653	enum ip_conntrack_info ctinfo;
654	struct tcphdr *tcph = NULL;
655	bool force_refresh = false;
656	struct flow_offload *flow;
657	struct nf_conn *ct;
658	u8 dir;
659
660	switch (family) {
661	case NFPROTO_IPV4:
662		if (!tcf_ct_flow_table_fill_tuple_ipv4(skb, &tuple, &tcph))
663			return false;
664		break;
665	case NFPROTO_IPV6:
666		if (!tcf_ct_flow_table_fill_tuple_ipv6(skb, &tuple, &tcph))
667			return false;
668		break;
669	default:
670		return false;
671	}
672
673	tuplehash = flow_offload_lookup(nf_ft, &tuple);
674	if (!tuplehash)
675		return false;
676
677	dir = tuplehash->tuple.dir;
678	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
679	ct = flow->ct;
680
681	if (dir == FLOW_OFFLOAD_DIR_REPLY &&
682	    !test_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags)) {
683		/* Only offload the reply direction after the connection
684		 * becomes assured.
685		 */
686		if (test_bit(IPS_ASSURED_BIT, &ct->status))
687			set_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags);
688		else if (test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags))
689			/* If flow_table flow has already been updated to the
690			 * established state, then don't refresh.
691			 */
692			return false;
693		force_refresh = true;
694	}
695
696	if (tcph && (unlikely(tcph->fin || tcph->rst))) {
697		flow_offload_teardown(flow);
698		return false;
699	}
700
701	if (dir == FLOW_OFFLOAD_DIR_ORIGINAL)
702		ctinfo = test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ?
703			IP_CT_ESTABLISHED : IP_CT_NEW;
704	else
705		ctinfo = IP_CT_ESTABLISHED_REPLY;
706
707	nf_conn_act_ct_ext_fill(skb, ct, ctinfo);
708	tcf_ct_flow_ct_ext_ifidx_update(flow);
709	flow_offload_refresh(nf_ft, flow, force_refresh);
710	if (!test_bit(IPS_ASSURED_BIT, &ct->status)) {
711		/* Process this flow in SW to allow promoting to ASSURED */
712		return false;
713	}
714
715	nf_conntrack_get(&ct->ct_general);
716	nf_ct_set(skb, ct, ctinfo);
717	if (nf_ft->flags & NF_FLOWTABLE_COUNTER)
718		nf_ct_acct_update(ct, dir, skb->len);
719
720	return true;
721}
722
723static int tcf_ct_flow_tables_init(void)
724{
725	return rhashtable_init(&zones_ht, &zones_params);
726}
727
728static void tcf_ct_flow_tables_uninit(void)
729{
730	rhashtable_destroy(&zones_ht);
731}
732
733static struct tc_action_ops act_ct_ops;
734
735struct tc_ct_action_net {
736	struct tc_action_net tn; /* Must be first */
737	bool labels;
738};
739
740/* Determine whether skb->_nfct is equal to the result of conntrack lookup. */
741static bool tcf_ct_skb_nfct_cached(struct net *net, struct sk_buff *skb,
742				   struct tcf_ct_params *p)
743{
744	enum ip_conntrack_info ctinfo;
745	struct nf_conn *ct;
746
747	ct = nf_ct_get(skb, &ctinfo);
748	if (!ct)
749		return false;
750	if (!net_eq(net, read_pnet(&ct->ct_net)))
751		goto drop_ct;
752	if (nf_ct_zone(ct)->id != p->zone)
753		goto drop_ct;
754	if (p->helper) {
755		struct nf_conn_help *help;
756
757		help = nf_ct_ext_find(ct, NF_CT_EXT_HELPER);
758		if (help && rcu_access_pointer(help->helper) != p->helper)
759			goto drop_ct;
760	}
761
762	/* Force conntrack entry direction. */
763	if ((p->ct_action & TCA_CT_ACT_FORCE) &&
764	    CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) {
765		if (nf_ct_is_confirmed(ct))
766			nf_ct_kill(ct);
767
768		goto drop_ct;
769	}
770
771	return true;
772
773drop_ct:
774	nf_ct_put(ct);
775	nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
776
777	return false;
778}
779
780static u8 tcf_ct_skb_nf_family(struct sk_buff *skb)
781{
782	u8 family = NFPROTO_UNSPEC;
783
784	switch (skb_protocol(skb, true)) {
785	case htons(ETH_P_IP):
786		family = NFPROTO_IPV4;
787		break;
788	case htons(ETH_P_IPV6):
789		family = NFPROTO_IPV6;
790		break;
791	default:
792		break;
793	}
794
795	return family;
796}
797
798static int tcf_ct_ipv4_is_fragment(struct sk_buff *skb, bool *frag)
799{
800	unsigned int len;
801
802	len = skb_network_offset(skb) + sizeof(struct iphdr);
803	if (unlikely(skb->len < len))
804		return -EINVAL;
805	if (unlikely(!pskb_may_pull(skb, len)))
806		return -ENOMEM;
807
808	*frag = ip_is_fragment(ip_hdr(skb));
809	return 0;
810}
811
812static int tcf_ct_ipv6_is_fragment(struct sk_buff *skb, bool *frag)
813{
814	unsigned int flags = 0, len, payload_ofs = 0;
815	unsigned short frag_off;
816	int nexthdr;
817
818	len = skb_network_offset(skb) + sizeof(struct ipv6hdr);
819	if (unlikely(skb->len < len))
820		return -EINVAL;
821	if (unlikely(!pskb_may_pull(skb, len)))
822		return -ENOMEM;
823
824	nexthdr = ipv6_find_hdr(skb, &payload_ofs, -1, &frag_off, &flags);
825	if (unlikely(nexthdr < 0))
826		return -EPROTO;
827
828	*frag = flags & IP6_FH_F_FRAG;
829	return 0;
830}
831
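/* Defragment IPv4/IPv6 packets before they are fed to conntrack, unless the
 * skb is already tracked. On successful reassembly the MRU is recorded in
 * the tc skb cb so the packet can be re-fragmented correctly on transmit.
 */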
832static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
833				   u8 family, u16 zone, bool *defrag)
834{
835	enum ip_conntrack_info ctinfo;
836	struct nf_conn *ct;
837	int err = 0;
838	bool frag;
839	u8 proto;
840	u16 mru;
841
842	/* Previously seen (loopback)? Ignore. */
843	ct = nf_ct_get(skb, &ctinfo);
844	if ((ct && !nf_ct_is_template(ct)) || ctinfo == IP_CT_UNTRACKED)
845		return 0;
846
847	if (family == NFPROTO_IPV4)
848		err = tcf_ct_ipv4_is_fragment(skb, &frag);
849	else
850		err = tcf_ct_ipv6_is_fragment(skb, &frag);
851	if (err || !frag)
852		return err;
853
854	err = nf_ct_handle_fragments(net, skb, zone, family, &proto, &mru);
855	if (err)
856		return err;
857
858	*defrag = true;
859	tc_skb_cb(skb)->mru = mru;
860
861	return 0;
862}
863
864static void tcf_ct_params_free(struct tcf_ct_params *params)
865{
866	if (params->helper) {
867#if IS_ENABLED(CONFIG_NF_NAT)
868		if (params->ct_action & TCA_CT_ACT_NAT)
869			nf_nat_helper_put(params->helper);
870#endif
871		nf_conntrack_helper_put(params->helper);
872	}
873	if (params->ct_ft)
874		tcf_ct_flow_table_put(params->ct_ft);
875	if (params->tmpl)
876		nf_ct_put(params->tmpl);
877	kfree(params);
878}
879
880static void tcf_ct_params_free_rcu(struct rcu_head *head)
881{
882	struct tcf_ct_params *params;
883
884	params = container_of(head, struct tcf_ct_params, rcu);
885	tcf_ct_params_free(params);
886}
887
888static void tcf_ct_act_set_mark(struct nf_conn *ct, u32 mark, u32 mask)
889{
890#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
891	u32 new_mark;
892
893	if (!mask)
894		return;
895
896	new_mark = mark | (READ_ONCE(ct->mark) & ~(mask));
897	if (READ_ONCE(ct->mark) != new_mark) {
898		WRITE_ONCE(ct->mark, new_mark);
899		if (nf_ct_is_confirmed(ct))
900			nf_conntrack_event_cache(IPCT_MARK, ct);
901	}
902#endif
903}
904
905static void tcf_ct_act_set_labels(struct nf_conn *ct,
906				  u32 *labels,
907				  u32 *labels_m)
908{
909#if IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)
910	size_t labels_sz = sizeof_field(struct tcf_ct_params, labels);
911
912	if (!memchr_inv(labels_m, 0, labels_sz))
913		return;
914
915	nf_connlabels_replace(ct, labels, labels_m, 4);
916#endif
917}
918
919static int tcf_ct_act_nat(struct sk_buff *skb,
920			  struct nf_conn *ct,
921			  enum ip_conntrack_info ctinfo,
922			  int ct_action,
923			  struct nf_nat_range2 *range,
924			  bool commit)
925{
926#if IS_ENABLED(CONFIG_NF_NAT)
927	int err, action = 0;
928
929	if (!(ct_action & TCA_CT_ACT_NAT))
930		return NF_ACCEPT;
931	if (ct_action & TCA_CT_ACT_NAT_SRC)
932		action |= BIT(NF_NAT_MANIP_SRC);
933	if (ct_action & TCA_CT_ACT_NAT_DST)
934		action |= BIT(NF_NAT_MANIP_DST);
935
936	err = nf_ct_nat(skb, ct, ctinfo, &action, range, commit);
937
938	if (action & BIT(NF_NAT_MANIP_SRC))
939		tc_skb_cb(skb)->post_ct_snat = 1;
940	if (action & BIT(NF_NAT_MANIP_DST))
941		tc_skb_cb(skb)->post_ct_dnat = 1;
942
943	return err;
944#else
945	return NF_ACCEPT;
946#endif
947}
948
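/* Main per-packet handler of the ct action: handle TCA_CT_ACT_CLEAR, pull
 * the skb to L3 and defragment, then find the conntrack entry via the
 * cached skb state, the flow-table fast path or nf_conntrack_in(). NAT,
 * helpers, mark/label updates and confirmation are applied as configured,
 * and eligible connections are handed to the flow table for offload.
 */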
949TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
950				 struct tcf_result *res)
951{
952	struct net *net = dev_net(skb->dev);
953	enum ip_conntrack_info ctinfo;
954	struct tcf_ct *c = to_ct(a);
955	struct nf_conn *tmpl = NULL;
956	struct nf_hook_state state;
957	bool cached, commit, clear;
958	int nh_ofs, err, retval;
959	struct tcf_ct_params *p;
960	bool add_helper = false;
961	bool skip_add = false;
962	bool defrag = false;
963	struct nf_conn *ct;
964	u8 family;
965
966	p = rcu_dereference_bh(c->params);
967
968	retval = READ_ONCE(c->tcf_action);
969	commit = p->ct_action & TCA_CT_ACT_COMMIT;
970	clear = p->ct_action & TCA_CT_ACT_CLEAR;
971	tmpl = p->tmpl;
972
973	tcf_lastuse_update(&c->tcf_tm);
974	tcf_action_update_bstats(&c->common, skb);
975
976	if (clear) {
977		tc_skb_cb(skb)->post_ct = false;
978		ct = nf_ct_get(skb, &ctinfo);
979		if (ct) {
980			nf_ct_put(ct);
981			nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
982		}
983
984		goto out_clear;
985	}
986
987	family = tcf_ct_skb_nf_family(skb);
988	if (family == NFPROTO_UNSPEC)
989		goto drop;
990
991	/* The conntrack module expects to be working at L3.
992	 * We also try to pull the IPv4/6 header into the linear area
993	 */
994	nh_ofs = skb_network_offset(skb);
995	skb_pull_rcsum(skb, nh_ofs);
996	err = tcf_ct_handle_fragments(net, skb, family, p->zone, &defrag);
997	if (err)
998		goto out_frag;
999
1000	err = nf_ct_skb_network_trim(skb, family);
1001	if (err)
1002		goto drop;
1003
1004	/* If we are recirculating packets to match on ct fields and
1005	 * committing with a separate ct action, then we don't need to
1006	 * actually run the packet through conntrack twice unless it's for a
1007	 * different zone.
1008	 */
1009	cached = tcf_ct_skb_nfct_cached(net, skb, p);
1010	if (!cached) {
1011		if (tcf_ct_flow_table_lookup(p, skb, family)) {
1012			skip_add = true;
1013			goto do_nat;
1014		}
1015
1016		/* Associate skb with specified zone. */
1017		if (tmpl) {
1018			nf_conntrack_put(skb_nfct(skb));
1019			nf_conntrack_get(&tmpl->ct_general);
1020			nf_ct_set(skb, tmpl, IP_CT_NEW);
1021		}
1022
1023		state.hook = NF_INET_PRE_ROUTING;
1024		state.net = net;
1025		state.pf = family;
1026		err = nf_conntrack_in(skb, &state);
1027		if (err != NF_ACCEPT)
1028			goto out_push;
1029	}
1030
1031do_nat:
1032	ct = nf_ct_get(skb, &ctinfo);
1033	if (!ct)
1034		goto out_push;
1035	nf_ct_deliver_cached_events(ct);
1036	nf_conn_act_ct_ext_fill(skb, ct, ctinfo);
1037
1038	err = tcf_ct_act_nat(skb, ct, ctinfo, p->ct_action, &p->range, commit);
1039	if (err != NF_ACCEPT)
1040		goto drop;
1041
1042	if (!nf_ct_is_confirmed(ct) && commit && p->helper && !nfct_help(ct)) {
1043		err = __nf_ct_try_assign_helper(ct, p->tmpl, GFP_ATOMIC);
1044		if (err)
1045			goto drop;
1046		add_helper = true;
1047		if (p->ct_action & TCA_CT_ACT_NAT && !nfct_seqadj(ct)) {
1048			if (!nfct_seqadj_ext_add(ct))
1049				goto drop;
1050		}
1051	}
1052
1053	if (nf_ct_is_confirmed(ct) ? ((!cached && !skip_add) || add_helper) : commit) {
1054		if (nf_ct_helper(skb, ct, ctinfo, family) != NF_ACCEPT)
1055			goto drop;
1056	}
1057
1058	if (commit) {
1059		tcf_ct_act_set_mark(ct, p->mark, p->mark_mask);
1060		tcf_ct_act_set_labels(ct, p->labels, p->labels_mask);
1061
1062		if (!nf_ct_is_confirmed(ct))
1063			nf_conn_act_ct_ext_add(skb, ct, ctinfo);
1064
1065		/* This will take care of sending queued events
1066		 * even if the connection is already confirmed.
1067		 */
1068		if (nf_conntrack_confirm(skb) != NF_ACCEPT)
1069			goto drop;
1070	}
1071
1072	if (!skip_add)
1073		tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo);
1074
1075out_push:
1076	skb_push_rcsum(skb, nh_ofs);
1077
1078	tc_skb_cb(skb)->post_ct = true;
1079	tc_skb_cb(skb)->zone = p->zone;
1080out_clear:
1081	if (defrag)
1082		qdisc_skb_cb(skb)->pkt_len = skb->len;
1083	return retval;
1084
1085out_frag:
1086	if (err != -EINPROGRESS)
1087		tcf_action_inc_drop_qstats(&c->common);
1088	return TC_ACT_CONSUMED;
1089
1090drop:
1091	tcf_action_inc_drop_qstats(&c->common);
1092	return TC_ACT_SHOT;
1093}
1094
1095static const struct nla_policy ct_policy[TCA_CT_MAX + 1] = {
1096	[TCA_CT_ACTION] = { .type = NLA_U16 },
1097	[TCA_CT_PARMS] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_ct)),
1098	[TCA_CT_ZONE] = { .type = NLA_U16 },
1099	[TCA_CT_MARK] = { .type = NLA_U32 },
1100	[TCA_CT_MARK_MASK] = { .type = NLA_U32 },
1101	[TCA_CT_LABELS] = { .type = NLA_BINARY,
1102			    .len = 128 / BITS_PER_BYTE },
1103	[TCA_CT_LABELS_MASK] = { .type = NLA_BINARY,
1104				 .len = 128 / BITS_PER_BYTE },
1105	[TCA_CT_NAT_IPV4_MIN] = { .type = NLA_U32 },
1106	[TCA_CT_NAT_IPV4_MAX] = { .type = NLA_U32 },
1107	[TCA_CT_NAT_IPV6_MIN] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
1108	[TCA_CT_NAT_IPV6_MAX] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
1109	[TCA_CT_NAT_PORT_MIN] = { .type = NLA_U16 },
1110	[TCA_CT_NAT_PORT_MAX] = { .type = NLA_U16 },
1111	[TCA_CT_HELPER_NAME] = { .type = NLA_STRING, .len = NF_CT_HELPER_NAME_LEN },
1112	[TCA_CT_HELPER_FAMILY] = { .type = NLA_U8 },
1113	[TCA_CT_HELPER_PROTO] = { .type = NLA_U8 },
1114};
1115
1116static int tcf_ct_fill_params_nat(struct tcf_ct_params *p,
1117				  struct tc_ct *parm,
1118				  struct nlattr **tb,
1119				  struct netlink_ext_ack *extack)
1120{
1121	struct nf_nat_range2 *range;
1122
1123	if (!(p->ct_action & TCA_CT_ACT_NAT))
1124		return 0;
1125
1126	if (!IS_ENABLED(CONFIG_NF_NAT)) {
1127		NL_SET_ERR_MSG_MOD(extack, "Netfilter nat isn't enabled in kernel");
1128		return -EOPNOTSUPP;
1129	}
1130
1131	if (!(p->ct_action & (TCA_CT_ACT_NAT_SRC | TCA_CT_ACT_NAT_DST)))
1132		return 0;
1133
1134	if ((p->ct_action & TCA_CT_ACT_NAT_SRC) &&
1135	    (p->ct_action & TCA_CT_ACT_NAT_DST)) {
1136		NL_SET_ERR_MSG_MOD(extack, "dnat and snat can't be enabled at the same time");
1137		return -EOPNOTSUPP;
1138	}
1139
1140	range = &p->range;
1141	if (tb[TCA_CT_NAT_IPV4_MIN]) {
1142		struct nlattr *max_attr = tb[TCA_CT_NAT_IPV4_MAX];
1143
1144		p->ipv4_range = true;
1145		range->flags |= NF_NAT_RANGE_MAP_IPS;
1146		range->min_addr.ip =
1147			nla_get_in_addr(tb[TCA_CT_NAT_IPV4_MIN]);
1148
1149		range->max_addr.ip = max_attr ?
1150				     nla_get_in_addr(max_attr) :
1151				     range->min_addr.ip;
1152	} else if (tb[TCA_CT_NAT_IPV6_MIN]) {
1153		struct nlattr *max_attr = tb[TCA_CT_NAT_IPV6_MAX];
1154
1155		p->ipv4_range = false;
1156		range->flags |= NF_NAT_RANGE_MAP_IPS;
1157		range->min_addr.in6 =
1158			nla_get_in6_addr(tb[TCA_CT_NAT_IPV6_MIN]);
1159
1160		range->max_addr.in6 = max_attr ?
1161				      nla_get_in6_addr(max_attr) :
1162				      range->min_addr.in6;
1163	}
1164
1165	if (tb[TCA_CT_NAT_PORT_MIN]) {
1166		range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
1167		range->min_proto.all = nla_get_be16(tb[TCA_CT_NAT_PORT_MIN]);
1168
1169		range->max_proto.all = tb[TCA_CT_NAT_PORT_MAX] ?
1170				       nla_get_be16(tb[TCA_CT_NAT_PORT_MAX]) :
1171				       range->min_proto.all;
1172	}
1173
1174	return 0;
1175}
1176
1177static void tcf_ct_set_key_val(struct nlattr **tb,
1178			       void *val, int val_type,
1179			       void *mask, int mask_type,
1180			       int len)
1181{
1182	if (!tb[val_type])
1183		return;
1184	nla_memcpy(val, tb[val_type], len);
1185
1186	if (!mask)
1187		return;
1188
1189	if (mask_type == TCA_CT_UNSPEC || !tb[mask_type])
1190		memset(mask, 0xff, len);
1191	else
1192		nla_memcpy(mask, tb[mask_type], len);
1193}
1194
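/* Translate the netlink attributes of a ct action into tcf_ct_params:
 * action flags, NAT range, mark/labels/zone, a conntrack template in the
 * requested zone and, optionally, a conntrack helper.
 */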
1195static int tcf_ct_fill_params(struct net *net,
1196			      struct tcf_ct_params *p,
1197			      struct tc_ct *parm,
1198			      struct nlattr **tb,
1199			      struct netlink_ext_ack *extack)
1200{
1201	struct tc_ct_action_net *tn = net_generic(net, act_ct_ops.net_id);
1202	struct nf_conntrack_zone zone;
1203	int err, family, proto, len;
1204	struct nf_conn *tmpl;
1205	char *name;
1206
1207	p->zone = NF_CT_DEFAULT_ZONE_ID;
1208
1209	tcf_ct_set_key_val(tb,
1210			   &p->ct_action, TCA_CT_ACTION,
1211			   NULL, TCA_CT_UNSPEC,
1212			   sizeof(p->ct_action));
1213
1214	if (p->ct_action & TCA_CT_ACT_CLEAR)
1215		return 0;
1216
1217	err = tcf_ct_fill_params_nat(p, parm, tb, extack);
1218	if (err)
1219		return err;
1220
1221	if (tb[TCA_CT_MARK]) {
1222		if (!IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)) {
1223			NL_SET_ERR_MSG_MOD(extack, "Conntrack mark isn't enabled.");
1224			return -EOPNOTSUPP;
1225		}
1226		tcf_ct_set_key_val(tb,
1227				   &p->mark, TCA_CT_MARK,
1228				   &p->mark_mask, TCA_CT_MARK_MASK,
1229				   sizeof(p->mark));
1230	}
1231
1232	if (tb[TCA_CT_LABELS]) {
1233		if (!IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)) {
1234			NL_SET_ERR_MSG_MOD(extack, "Conntrack labels isn't enabled.");
1235			return -EOPNOTSUPP;
1236		}
1237
1238		if (!tn->labels) {
1239			NL_SET_ERR_MSG_MOD(extack, "Failed to set connlabel length");
1240			return -EOPNOTSUPP;
1241		}
1242		tcf_ct_set_key_val(tb,
1243				   p->labels, TCA_CT_LABELS,
1244				   p->labels_mask, TCA_CT_LABELS_MASK,
1245				   sizeof(p->labels));
1246	}
1247
1248	if (tb[TCA_CT_ZONE]) {
1249		if (!IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)) {
1250			NL_SET_ERR_MSG_MOD(extack, "Conntrack zones isn't enabled.");
1251			return -EOPNOTSUPP;
1252		}
1253
1254		tcf_ct_set_key_val(tb,
1255				   &p->zone, TCA_CT_ZONE,
1256				   NULL, TCA_CT_UNSPEC,
1257				   sizeof(p->zone));
1258	}
1259
1260	nf_ct_zone_init(&zone, p->zone, NF_CT_DEFAULT_ZONE_DIR, 0);
1261	tmpl = nf_ct_tmpl_alloc(net, &zone, GFP_KERNEL);
1262	if (!tmpl) {
1263		NL_SET_ERR_MSG_MOD(extack, "Failed to allocate conntrack template");
1264		return -ENOMEM;
1265	}
1266	p->tmpl = tmpl;
1267	if (tb[TCA_CT_HELPER_NAME]) {
1268		name = nla_data(tb[TCA_CT_HELPER_NAME]);
1269		len = nla_len(tb[TCA_CT_HELPER_NAME]);
1270		if (len > 16 || name[len - 1] != '\0') {
1271			NL_SET_ERR_MSG_MOD(extack, "Failed to parse helper name.");
1272			err = -EINVAL;
1273			goto err;
1274		}
1275		family = tb[TCA_CT_HELPER_FAMILY] ? nla_get_u8(tb[TCA_CT_HELPER_FAMILY]) : AF_INET;
1276		proto = tb[TCA_CT_HELPER_PROTO] ? nla_get_u8(tb[TCA_CT_HELPER_PROTO]) : IPPROTO_TCP;
1277		err = nf_ct_add_helper(tmpl, name, family, proto,
1278				       p->ct_action & TCA_CT_ACT_NAT, &p->helper);
1279		if (err) {
1280			NL_SET_ERR_MSG_MOD(extack, "Failed to add helper");
1281			goto err;
1282		}
1283	}
1284
1285	if (p->ct_action & TCA_CT_ACT_COMMIT)
1286		__set_bit(IPS_CONFIRMED_BIT, &tmpl->status);
1287	return 0;
1288err:
1289	nf_ct_put(p->tmpl);
1290	p->tmpl = NULL;
1291	return err;
1292}
1293
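/* .init callback: parse the netlink configuration, create or update the
 * action instance, build a new tcf_ct_params (including the per-zone flow
 * table reference) and swap it in under tcf_lock, freeing the old
 * parameters after an RCU grace period.
 */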
1294static int tcf_ct_init(struct net *net, struct nlattr *nla,
1295		       struct nlattr *est, struct tc_action **a,
1296		       struct tcf_proto *tp, u32 flags,
1297		       struct netlink_ext_ack *extack)
1298{
1299	struct tc_action_net *tn = net_generic(net, act_ct_ops.net_id);
1300	bool bind = flags & TCA_ACT_FLAGS_BIND;
1301	struct tcf_ct_params *params = NULL;
1302	struct nlattr *tb[TCA_CT_MAX + 1];
1303	struct tcf_chain *goto_ch = NULL;
1304	struct tc_ct *parm;
1305	struct tcf_ct *c;
1306	int err, res = 0;
1307	u32 index;
1308
1309	if (!nla) {
1310		NL_SET_ERR_MSG_MOD(extack, "Ct requires attributes to be passed");
1311		return -EINVAL;
1312	}
1313
1314	err = nla_parse_nested(tb, TCA_CT_MAX, nla, ct_policy, extack);
1315	if (err < 0)
1316		return err;
1317
1318	if (!tb[TCA_CT_PARMS]) {
1319		NL_SET_ERR_MSG_MOD(extack, "Missing required ct parameters");
1320		return -EINVAL;
1321	}
1322	parm = nla_data(tb[TCA_CT_PARMS]);
1323	index = parm->index;
1324	err = tcf_idr_check_alloc(tn, &index, a, bind);
1325	if (err < 0)
1326		return err;
1327
1328	if (!err) {
1329		err = tcf_idr_create_from_flags(tn, index, est, a,
1330						&act_ct_ops, bind, flags);
1331		if (err) {
1332			tcf_idr_cleanup(tn, index);
1333			return err;
1334		}
1335		res = ACT_P_CREATED;
1336	} else {
1337		if (bind)
1338			return 0;
1339
1340		if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
1341			tcf_idr_release(*a, bind);
1342			return -EEXIST;
1343		}
1344	}
1345	err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
1346	if (err < 0)
1347		goto cleanup;
1348
1349	c = to_ct(*a);
1350
1351	params = kzalloc(sizeof(*params), GFP_KERNEL);
1352	if (unlikely(!params)) {
1353		err = -ENOMEM;
1354		goto cleanup;
1355	}
1356
1357	err = tcf_ct_fill_params(net, params, parm, tb, extack);
1358	if (err)
1359		goto cleanup;
1360
1361	err = tcf_ct_flow_table_get(net, params);
1362	if (err)
1363		goto cleanup;
1364
1365	spin_lock_bh(&c->tcf_lock);
1366	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
1367	params = rcu_replace_pointer(c->params, params,
1368				     lockdep_is_held(&c->tcf_lock));
1369	spin_unlock_bh(&c->tcf_lock);
1370
1371	if (goto_ch)
1372		tcf_chain_put_by_act(goto_ch);
1373	if (params)
1374		call_rcu(&params->rcu, tcf_ct_params_free_rcu);
1375
1376	return res;
1377
1378cleanup:
1379	if (goto_ch)
1380		tcf_chain_put_by_act(goto_ch);
1381	if (params)
1382		tcf_ct_params_free(params);
1383	tcf_idr_release(*a, bind);
1384	return err;
1385}
1386
1387static void tcf_ct_cleanup(struct tc_action *a)
1388{
1389	struct tcf_ct_params *params;
1390	struct tcf_ct *c = to_ct(a);
1391
1392	params = rcu_dereference_protected(c->params, 1);
1393	if (params)
1394		call_rcu(&params->rcu, tcf_ct_params_free_rcu);
1395}
1396
1397static int tcf_ct_dump_key_val(struct sk_buff *skb,
1398			       void *val, int val_type,
1399			       void *mask, int mask_type,
1400			       int len)
1401{
1402	int err;
1403
1404	if (mask && !memchr_inv(mask, 0, len))
1405		return 0;
1406
1407	err = nla_put(skb, val_type, len, val);
1408	if (err)
1409		return err;
1410
1411	if (mask_type != TCA_CT_UNSPEC) {
1412		err = nla_put(skb, mask_type, len, mask);
1413		if (err)
1414			return err;
1415	}
1416
1417	return 0;
1418}
1419
1420static int tcf_ct_dump_nat(struct sk_buff *skb, struct tcf_ct_params *p)
1421{
1422	struct nf_nat_range2 *range = &p->range;
1423
1424	if (!(p->ct_action & TCA_CT_ACT_NAT))
1425		return 0;
1426
1427	if (!(p->ct_action & (TCA_CT_ACT_NAT_SRC | TCA_CT_ACT_NAT_DST)))
1428		return 0;
1429
1430	if (range->flags & NF_NAT_RANGE_MAP_IPS) {
1431		if (p->ipv4_range) {
1432			if (nla_put_in_addr(skb, TCA_CT_NAT_IPV4_MIN,
1433					    range->min_addr.ip))
1434				return -1;
1435			if (nla_put_in_addr(skb, TCA_CT_NAT_IPV4_MAX,
1436					    range->max_addr.ip))
1437				return -1;
1438		} else {
1439			if (nla_put_in6_addr(skb, TCA_CT_NAT_IPV6_MIN,
1440					     &range->min_addr.in6))
1441				return -1;
1442			if (nla_put_in6_addr(skb, TCA_CT_NAT_IPV6_MAX,
1443					     &range->max_addr.in6))
1444				return -1;
1445		}
1446	}
1447
1448	if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
1449		if (nla_put_be16(skb, TCA_CT_NAT_PORT_MIN,
1450				 range->min_proto.all))
1451			return -1;
1452		if (nla_put_be16(skb, TCA_CT_NAT_PORT_MAX,
1453				 range->max_proto.all))
1454			return -1;
1455	}
1456
1457	return 0;
1458}
1459
1460static int tcf_ct_dump_helper(struct sk_buff *skb, struct nf_conntrack_helper *helper)
1461{
1462	if (!helper)
1463		return 0;
1464
1465	if (nla_put_string(skb, TCA_CT_HELPER_NAME, helper->name) ||
1466	    nla_put_u8(skb, TCA_CT_HELPER_FAMILY, helper->tuple.src.l3num) ||
1467	    nla_put_u8(skb, TCA_CT_HELPER_PROTO, helper->tuple.dst.protonum))
1468		return -1;
1469
1470	return 0;
1471}
1472
1473static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a,
1474			      int bind, int ref)
1475{
1476	unsigned char *b = skb_tail_pointer(skb);
1477	struct tcf_ct *c = to_ct(a);
1478	struct tcf_ct_params *p;
1479
1480	struct tc_ct opt = {
1481		.index   = c->tcf_index,
1482		.refcnt  = refcount_read(&c->tcf_refcnt) - ref,
1483		.bindcnt = atomic_read(&c->tcf_bindcnt) - bind,
1484	};
1485	struct tcf_t t;
1486
1487	spin_lock_bh(&c->tcf_lock);
1488	p = rcu_dereference_protected(c->params,
1489				      lockdep_is_held(&c->tcf_lock));
1490	opt.action = c->tcf_action;
1491
1492	if (tcf_ct_dump_key_val(skb,
1493				&p->ct_action, TCA_CT_ACTION,
1494				NULL, TCA_CT_UNSPEC,
1495				sizeof(p->ct_action)))
1496		goto nla_put_failure;
1497
1498	if (p->ct_action & TCA_CT_ACT_CLEAR)
1499		goto skip_dump;
1500
1501	if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) &&
1502	    tcf_ct_dump_key_val(skb,
1503				&p->mark, TCA_CT_MARK,
1504				&p->mark_mask, TCA_CT_MARK_MASK,
1505				sizeof(p->mark)))
1506		goto nla_put_failure;
1507
1508	if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
1509	    tcf_ct_dump_key_val(skb,
1510				p->labels, TCA_CT_LABELS,
1511				p->labels_mask, TCA_CT_LABELS_MASK,
1512				sizeof(p->labels)))
1513		goto nla_put_failure;
1514
1515	if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
1516	    tcf_ct_dump_key_val(skb,
1517				&p->zone, TCA_CT_ZONE,
1518				NULL, TCA_CT_UNSPEC,
1519				sizeof(p->zone)))
1520		goto nla_put_failure;
1521
1522	if (tcf_ct_dump_nat(skb, p))
1523		goto nla_put_failure;
1524
1525	if (tcf_ct_dump_helper(skb, p->helper))
1526		goto nla_put_failure;
1527
1528skip_dump:
1529	if (nla_put(skb, TCA_CT_PARMS, sizeof(opt), &opt))
1530		goto nla_put_failure;
1531
1532	tcf_tm_dump(&t, &c->tcf_tm);
1533	if (nla_put_64bit(skb, TCA_CT_TM, sizeof(t), &t, TCA_CT_PAD))
1534		goto nla_put_failure;
1535	spin_unlock_bh(&c->tcf_lock);
1536
1537	return skb->len;
1538nla_put_failure:
1539	spin_unlock_bh(&c->tcf_lock);
1540	nlmsg_trim(skb, b);
1541	return -1;
1542}
1543
1544static void tcf_stats_update(struct tc_action *a, u64 bytes, u64 packets,
1545			     u64 drops, u64 lastuse, bool hw)
1546{
1547	struct tcf_ct *c = to_ct(a);
1548
1549	tcf_action_update_stats(a, bytes, packets, drops, hw);
1550	c->tcf_tm.lastuse = max_t(u64, c->tcf_tm.lastuse, lastuse);
1551}
1552
1553static int tcf_ct_offload_act_setup(struct tc_action *act, void *entry_data,
1554				    u32 *index_inc, bool bind,
1555				    struct netlink_ext_ack *extack)
1556{
1557	if (bind) {
1558		struct flow_action_entry *entry = entry_data;
1559
1560		if (tcf_ct_helper(act))
1561			return -EOPNOTSUPP;
1562
1563		entry->id = FLOW_ACTION_CT;
1564		entry->ct.action = tcf_ct_action(act);
1565		entry->ct.zone = tcf_ct_zone(act);
1566		entry->ct.flow_table = tcf_ct_ft(act);
1567		*index_inc = 1;
1568	} else {
1569		struct flow_offload_action *fl_action = entry_data;
1570
1571		fl_action->id = FLOW_ACTION_CT;
1572	}
1573
1574	return 0;
1575}
1576
1577static struct tc_action_ops act_ct_ops = {
1578	.kind		=	"ct",
1579	.id		=	TCA_ID_CT,
1580	.owner		=	THIS_MODULE,
1581	.act		=	tcf_ct_act,
1582	.dump		=	tcf_ct_dump,
1583	.init		=	tcf_ct_init,
1584	.cleanup	=	tcf_ct_cleanup,
1585	.stats_update	=	tcf_stats_update,
1586	.offload_act_setup =	tcf_ct_offload_act_setup,
1587	.size		=	sizeof(struct tcf_ct),
1588};
1589
1590static __net_init int ct_init_net(struct net *net)
1591{
1592	unsigned int n_bits = sizeof_field(struct tcf_ct_params, labels) * 8;
1593	struct tc_ct_action_net *tn = net_generic(net, act_ct_ops.net_id);
1594
1595	if (nf_connlabels_get(net, n_bits - 1)) {
1596		tn->labels = false;
1597		pr_err("act_ct: Failed to set connlabels length\n");
1598	} else {
1599		tn->labels = true;
1600	}
1601
1602	return tc_action_net_init(net, &tn->tn, &act_ct_ops);
1603}
1604
1605static void __net_exit ct_exit_net(struct list_head *net_list)
1606{
1607	struct net *net;
1608
1609	rtnl_lock();
1610	list_for_each_entry(net, net_list, exit_list) {
1611		struct tc_ct_action_net *tn = net_generic(net, act_ct_ops.net_id);
1612
1613		if (tn->labels)
1614			nf_connlabels_put(net);
1615	}
1616	rtnl_unlock();
1617
1618	tc_action_net_exit(net_list, act_ct_ops.net_id);
1619}
1620
1621static struct pernet_operations ct_net_ops = {
1622	.init = ct_init_net,
1623	.exit_batch = ct_exit_net,
1624	.id   = &act_ct_ops.net_id,
1625	.size = sizeof(struct tc_ct_action_net),
1626};
1627
1628static int __init ct_init_module(void)
1629{
1630	int err;
1631
1632	act_ct_wq = alloc_ordered_workqueue("act_ct_workqueue", 0);
1633	if (!act_ct_wq)
1634		return -ENOMEM;
1635
1636	err = tcf_ct_flow_tables_init();
1637	if (err)
1638		goto err_tbl_init;
1639
1640	err = tcf_register_action(&act_ct_ops, &ct_net_ops);
1641	if (err)
1642		goto err_register;
1643
1644	static_branch_inc(&tcf_frag_xmit_count);
1645
1646	return 0;
1647
1648err_register:
1649	tcf_ct_flow_tables_uninit();
1650err_tbl_init:
1651	destroy_workqueue(act_ct_wq);
1652	return err;
1653}
1654
1655static void __exit ct_cleanup_module(void)
1656{
1657	static_branch_dec(&tcf_frag_xmit_count);
1658	tcf_unregister_action(&act_ct_ops, &ct_net_ops);
1659	tcf_ct_flow_tables_uninit();
1660	destroy_workqueue(act_ct_wq);
1661}
1662
1663module_init(ct_init_module);
1664module_exit(ct_cleanup_module);
1665MODULE_AUTHOR("Paul Blakey <paulb@mellanox.com>");
1666MODULE_AUTHOR("Yossi Kuperman <yossiku@mellanox.com>");
1667MODULE_AUTHOR("Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>");
1668MODULE_DESCRIPTION("Connection tracking action");
1669MODULE_LICENSE("GPL v2");
1670