Why an SNAT failure shows up as a connection timeout

intro

In an earlier test, the SNAT source was strictly pinned to one specific IP + port: all TCP traffic sent to 127.0.0.1:8080 on the local machine gets SNATed to source 127.0.0.1:40000.

tsecer@harry: sudo iptables -t nat -A POSTROUTING -p tcp -d 127.0.0.1 --dport 8080 -j SNAT --to-source 127.0.0.1:40000

At first glance this configuration is obviously problematic: because the choice of SNAT source is restricted to a single tuple, at most one connection can be NATed.

Unsurprisingly, that is exactly what the test shows: only one connection succeeds; every later connection attempt sends its SYN, sits in SYN_SENT, and finally times out because no reply ever comes back.
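As a rough reproduction sketch (the helper below is hypothetical and not part of the original test code; it assumes something is listening on 127.0.0.1:8080 and that the SNAT rule above is installed), the first connect() completes while the second one never does:

/* repro.c -- hypothetical reproduction helper, not from the original test.
 * Build: gcc -o repro repro.c
 * With the SNAT rule above installed and a listener on 127.0.0.1:8080,
 * the first connection completes and the second one stays in SYN_SENT
 * (visible via `ss -tan state syn-sent`). */
#define _GNU_SOURCE
#include <arpa/inet.h>
#include <errno.h>
#include <netinet/in.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int try_connect(int timeout_ms)
{
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port   = htons(8080),
	};
	struct pollfd pfd;
	int fd;

	inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr);
	fd = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, 0);
	if (fd < 0)
		return -1;

	if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) == 0) {
		printf("connected immediately, fd=%d\n", fd);
		return fd;
	}
	if (errno != EINPROGRESS) {
		close(fd);
		return -1;
	}

	pfd.fd = fd;
	pfd.events = POLLOUT;
	if (poll(&pfd, 1, timeout_ms) == 1 && (pfd.revents & POLLOUT)) {
		int err = 0;
		socklen_t len = sizeof(err);

		getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
		if (err == 0) {
			printf("connected, fd=%d\n", fd);
			return fd;
		}
		printf("connect failed: %s\n", strerror(err));
	} else {
		printf("no handshake reply within %d ms (socket left in SYN_SENT)\n",
		       timeout_ms);
	}
	return fd;	/* keep the fd open so the state stays visible in `ss -tan` */
}

int main(void)
{
	int a = try_connect(3000);	/* first connection: SNAT to :40000 succeeds */
	int b = try_connect(3000);	/* second connection: SYN dropped at confirm time */

	printf("fd1=%d fd2=%d; inspect with `ss -tan` and `conntrack -S`\n", a, b);
	pause();			/* keep both sockets alive for inspection */
	return 0;
}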

The fact that the connecting socket is in SYN_SENT means a socket was allocated and the handshake packet was sent; yet tcpdump never sees this packet leave the host and enter the network, which in turn means the packet was lost somewhere inside the local machine.

So the natural question is: where did that SYN go?

Allocation

There is a detail in the implementation: the candidate is first written into the selection result, and only afterwards is it checked; if it is not in use, the function returns. Put the other way around: if every candidate is already occupied, what is left behind is the last attempted (already occupied) address (IP + PORT).

another_round:
	for (i = 0; i < attempts; i++, off++) {
		*keyptr = htons(min + off % range_size);
		if (!nf_nat_used_tuple_harder(tuple, ct, attempts - i))
			return;
	}

///@file: nf_nat_core.c
xt_snat_target_v2>>nf_nat_setup_info>>get_unique_tuple==>>nf_nat_l4proto_unique_tuple

/* Alter the per-proto part of the tuple (depending on maniptype), to
 * give a unique tuple in the given range if possible.
 *
 * Per-protocol part of tuple is initialized to the incoming packet.
 */
static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
					const struct nf_nat_range2 *range,
					enum nf_nat_manip_type maniptype,
					const struct nf_conn *ct)
{
	unsigned int range_size, min, max, i, attempts;
	__be16 *keyptr;
	u16 off;

	switch (tuple->dst.protonum) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		/* id is same for either direction... */
		keyptr = &tuple->src.u.icmp.id;
		if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
			min = 0;
			range_size = 65536;
		} else {
			min = ntohs(range->min_proto.icmp.id);
			range_size = ntohs(range->max_proto.icmp.id) -
				     ntohs(range->min_proto.icmp.id) + 1;
		}
		goto find_free_id;
#if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE)
	case IPPROTO_GRE:
		/* If there is no master conntrack we are not PPTP,
		   do not change tuples */
		if (!ct->master)
			return;

		if (maniptype == NF_NAT_MANIP_SRC)
			keyptr = &tuple->src.u.gre.key;
		else
			keyptr = &tuple->dst.u.gre.key;

		if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
			min = 1;
			range_size = 65535;
		} else {
			min = ntohs(range->min_proto.gre.key);
			range_size = ntohs(range->max_proto.gre.key) - min + 1;
		}
		goto find_free_id;
#endif
	case IPPROTO_UDP:
	case IPPROTO_UDPLITE:
	case IPPROTO_TCP:
	case IPPROTO_SCTP:
	case IPPROTO_DCCP:
		if (maniptype == NF_NAT_MANIP_SRC)
			keyptr = &tuple->src.u.all;
		else
			keyptr = &tuple->dst.u.all;
		break;
	default:
		return;
	}

	/* If no range specified... */
	if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
		/* If it's dst rewrite, can't change port */
		if (maniptype == NF_NAT_MANIP_DST)
			return;

		if (ntohs(*keyptr) < 1024) {
			/* Loose convention: >> 512 is credential passing */
			if (ntohs(*keyptr) < 512) {
				min = 1;
				range_size = 511 - min + 1;
			} else {
				min = 600;
				range_size = 1023 - min + 1;
			}
		} else {
			min = 1024;
			range_size = 65535 - 1024 + 1;
		}
	} else {
		min = ntohs(range->min_proto.all);
		max = ntohs(range->max_proto.all);
		if (unlikely(max < min))
			swap(max, min);
		range_size = max - min + 1;
	}

find_free_id:
	if (range->flags & NF_NAT_RANGE_PROTO_OFFSET)
		off = (ntohs(*keyptr) - ntohs(range->base_proto.all));
	else if ((range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL) ||
		 maniptype != NF_NAT_MANIP_DST)
		off = get_random_u16();
	else
		off = 0;

	attempts = range_size;
	if (attempts > NF_NAT_MAX_ATTEMPTS)
		attempts = NF_NAT_MAX_ATTEMPTS;

	/* We are in softirq; doing a search of the entire range risks
	 * soft lockup when all tuples are already used.
	 *
	 * If we can't find any free port from first offset, pick a new
	 * one and try again, with ever smaller search window.
	 */
another_round:
	for (i = 0; i < attempts; i++, off++) {
		*keyptr = htons(min + off % range_size);
		if (!nf_nat_used_tuple_harder(tuple, ct, attempts - i))
			return;
	}

	if (attempts >= range_size || attempts < 16)
		return;
	attempts /= 2;
	off = get_random_u16();
	goto another_round;
}
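To make the degenerate case concrete, here is a small user-space simulation of the selection loop above (my sketch, not kernel code; tuple_in_use is a hypothetical stand-in for nf_nat_used_tuple_harder). Under the test rule the range contains a single port, so every attempt lands on 40000; once that tuple is taken, the loop runs out of attempts and the function returns with the occupied port still written into the tuple:

/* sim_unique_tuple.c -- user-space sketch of the selection loop above;
 * "tuple_in_use" is a hypothetical stand-in for nf_nat_used_tuple_harder. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool tuple_in_use(unsigned int port)
{
	return port == 40000;	/* pretend the only allowed port is already taken */
}

int main(void)
{
	uint16_t min = 40000, range_size = 1;	/* --to-source 127.0.0.1:40000 */
	uint16_t off = 12345;			/* random starting offset */
	unsigned int selected = 0;
	unsigned int attempts = range_size;	/* capped by NF_NAT_MAX_ATTEMPTS in the kernel */

	for (unsigned int i = 0; i < attempts; i++, off++) {
		selected = min + off % range_size;	/* always 40000 here */
		if (!tuple_in_use(selected)) {
			printf("free tuple found: port %u\n", selected);
			return 0;
		}
	}
	/* No free candidate: the last (occupied) choice is left in place and
	 * NAT setup "succeeds" anyway -- the clash is only detected later. */
	printf("all candidates busy, keeping occupied port %u\n", selected);
	return 0;
}

This is exactly why the SYN later reaches nf_confirm carrying a reply tuple that already exists in the conntrack table.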

Confirmation

Does having allocated an IP + PORT mean the conntrack SNAT has really succeeded? Clearly not, because if such a connection could actually be established it would imply a logic error. Take the test SNAT rule above: when a reply packet arrives, the five-tuple (proto, srcip, srcport, dstip, dstport) is the same for every SNATed connection, (tcp, 127.0.0.1, 8080, 127.0.0.1, 40000), so there is no way to tell which local socket (port) on this host it belongs to.
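As a simplified model of that collision (my sketch; the kernel works with struct nf_conntrack_tuple and nf_ct_invert_tuple instead), two clients with different ephemeral ports end up with exactly the same reply-direction tuple after the forced SNAT:

/* reply_tuple_clash.c -- simplified illustration, not kernel structures. */
#include <stdint.h>
#include <stdio.h>

struct tuple {
	const char *src_ip; int src_port;
	const char *dst_ip; int dst_port;
};

/* Reply tuple = inverted original tuple, with the SNAT source applied. */
static struct tuple reply_after_snat(struct tuple orig,
				     const char *snat_ip, int snat_port)
{
	struct tuple reply = {
		.src_ip = orig.dst_ip, .src_port = orig.dst_port,
		.dst_ip = snat_ip,     .dst_port = snat_port,
	};
	return reply;
}

int main(void)
{
	/* Two clients with different ephemeral ports, same forced SNAT source. */
	struct tuple c1 = { "127.0.0.1", 51000, "127.0.0.1", 8080 };
	struct tuple c2 = { "127.0.0.1", 51002, "127.0.0.1", 8080 };
	struct tuple r1 = reply_after_snat(c1, "127.0.0.1", 40000);
	struct tuple r2 = reply_after_snat(c2, "127.0.0.1", 40000);

	printf("reply1: %s:%d -> %s:%d\n", r1.src_ip, r1.src_port, r1.dst_ip, r1.dst_port);
	printf("reply2: %s:%d -> %s:%d\n", r2.src_ip, r2.src_port, r2.dst_ip, r2.dst_port);
	/* Both print tcp 127.0.0.1:8080 -> 127.0.0.1:40000: a reply carrying
	 * this tuple cannot be mapped back to a unique original connection. */
	return 0;
}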

The nf_conntrack module (nf_conntrack_proto.c) registers the nf_confirm hook function on the NF_INET_POST_ROUTING chain.

///@file: nf_conntrack_proto.c
/* Connection tracking may drop packets, but never alters them, so
 * make it the first hook.
 */
static const struct nf_hook_ops ipv4_conntrack_ops[] = {
	{
		.hook		= ipv4_conntrack_in,
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_PRE_ROUTING,
		.priority	= NF_IP_PRI_CONNTRACK,
	},
	{
		.hook		= ipv4_conntrack_local,
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_LOCAL_OUT,
		.priority	= NF_IP_PRI_CONNTRACK,
	},
	{
		.hook		= nf_confirm,
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_POST_ROUTING,
		.priority	= NF_IP_PRI_CONNTRACK_CONFIRM,
	},
	{
		.hook		= nf_confirm,
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_LOCAL_IN,
		.priority	= NF_IP_PRI_CONNTRACK_CONFIRM,
	},
};

NF_IP_PRI_CONNTRACK_CONFIRM is the lowest priority (the largest numeric value, INT_MAX), so nf_confirm runs only after all the other targets on the chain have executed.

enum nf_ip_hook_priorities {
	NF_IP_PRI_FIRST = INT_MIN,
	NF_IP_PRI_RAW_BEFORE_DEFRAG = -450,
	NF_IP_PRI_CONNTRACK_DEFRAG = -400,
	NF_IP_PRI_RAW = -300,
	NF_IP_PRI_SELINUX_FIRST = -225,
	NF_IP_PRI_CONNTRACK = -200,
	NF_IP_PRI_MANGLE = -150,
	NF_IP_PRI_NAT_DST = -100,
	NF_IP_PRI_FILTER = 0,
	NF_IP_PRI_SECURITY = 50,
	NF_IP_PRI_NAT_SRC = 100,
	NF_IP_PRI_SELINUX_LAST = 225,
	NF_IP_PRI_CONNTRACK_HELPER = 300,
	NF_IP_PRI_CONNTRACK_CONFIRM = INT_MAX,
	NF_IP_PRI_LAST = INT_MAX,
};
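Netfilter keeps the hooks registered on a chain sorted by ascending priority and invokes them in that order. As a purely illustrative sketch (not the kernel's actual nf_hook_entries handling), this is why SNAT (NF_IP_PRI_NAT_SRC = 100) has already rewritten the source tuple by the time nf_confirm (INT_MAX) runs on POST_ROUTING:

/* hook_order.c -- illustrative only; the kernel stores a chain's hooks
 * sorted by priority, and smaller values run first. */
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

struct hook { const char *name; int priority; };

static int by_priority(const void *a, const void *b)
{
	const struct hook *x = a, *y = b;
	return (x->priority > y->priority) - (x->priority < y->priority);
}

int main(void)
{
	struct hook post_routing[] = {
		{ "nf_confirm (conntrack confirm)", INT_MAX },	/* NF_IP_PRI_CONNTRACK_CONFIRM */
		{ "iptable_nat SNAT/MASQUERADE",    100     },	/* NF_IP_PRI_NAT_SRC */
		{ "iptable_mangle",                 -150    },	/* NF_IP_PRI_MANGLE */
	};
	size_t n = sizeof(post_routing) / sizeof(post_routing[0]);

	qsort(post_routing, n, sizeof(post_routing[0]), by_priority);
	for (size_t i = 0; i < n; i++)
		printf("%zu: %s (priority %d)\n", i, post_routing[i].name,
		       post_routing[i].priority);
	/* SNAT has already rewritten the tuple before nf_confirm tries to
	 * insert the conntrack entry into the hash table. */
	return 0;
}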

When a clashing entry is found, nf_ct_resolve_clash is called to try to resolve the clash.

nf_confirm>>nf_conntrack_confirm>>__nf_conntrack_confirm

/* Confirm a connection given skb; places it in hash table */
int
__nf_conntrack_confirm(struct sk_buff *skb)
{
	///...
	max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN);

	/* See if there's one in the list already, including reverse:
	   NAT could have grabbed it without realizing, since we're
	   not in the hash. If there is, we lost race. */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > max_chainlen)
			goto chaintoolong;
	}

	chainlen = 0;

	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > max_chainlen) {
chaintoolong:
			NF_CT_STAT_INC(net, chaintoolong);
			NF_CT_STAT_INC(net, insert_failed);
			ret = NF_DROP;
			goto dying;
		}
	}
	///...
out:
	ret = nf_ct_resolve_clash(skb, h, reply_hash);
dying:
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();
	return ret;
}

Since nf_ct_resolve_clash usually cannot actually resolve the clash (for TCP the l4proto does not even set allow_clash, and in this case the clash is in the REPLY direction anyway), it returns NF_DROP and increments the drop and insert_failed counters.

drop:
	NF_CT_STAT_INC(net, drop);
	NF_CT_STAT_INC(net, insert_failed);

In other words, the packet (struct sk_buff *skb) is dropped, and no conntrack entry is created for it.

/**
 * nf_ct_resolve_clash - attempt to handle clash without packet drop
 *
 * @skb: skb that causes the clash
 * @h: tuplehash of the clashing entry already in table
 * @reply_hash: hash slot for reply direction
 *
 * A conntrack entry can be inserted to the connection tracking table
 * if there is no existing entry with an identical tuple.
 *
 * If there is one, @skb (and the assocated, unconfirmed conntrack) has
 * to be dropped. In case @skb is retransmitted, next conntrack lookup
 * will find the already-existing entry.
 *
 * The major problem with such packet drop is the extra delay added by
 * the packet loss -- it will take some time for a retransmit to occur
 * (or the sender to time out when waiting for a reply).
 *
 * This function attempts to handle the situation without packet drop.
 *
 * If @skb has no NAT transformation or if the colliding entries are
 * exactly the same, only the to-be-confirmed conntrack entry is discarded
 * and @skb is associated with the conntrack entry already in the table.
 *
 * Failing that, the new, unconfirmed conntrack is still added to the table
 * provided that the collision only occurs in the ORIGINAL direction.
 * The new entry will be added only in the non-clashing REPLY direction,
 * so packets in the ORIGINAL direction will continue to match the existing
 * entry. The new entry will also have a fixed timeout so it expires --
 * due to the collision, it will only see reply traffic.
 *
 * Returns NF_DROP if the clash could not be resolved.
 */
static __cold noinline int
nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h,
		    u32 reply_hash)
{
	/* This is the conntrack entry already in hashes that won race. */
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
	const struct nf_conntrack_l4proto *l4proto;
	enum ip_conntrack_info ctinfo;
	struct nf_conn *loser_ct;
	struct net *net;
	int ret;

	loser_ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(loser_ct);

	l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
	if (!l4proto->allow_clash)
		goto drop;

	ret = __nf_ct_resolve_clash(skb, h);
	if (ret == NF_ACCEPT)
		return ret;

	ret = nf_ct_resolve_clash_harder(skb, reply_hash);
	if (ret == NF_ACCEPT)
		return ret;

drop:
	NF_CT_STAT_INC(net, drop);
	NF_CT_STAT_INC(net, insert_failed);
	return NF_DROP;
}

Verification

Watching the output of conntrack -S, you can see insert_failed/drop on cpu=15 increasing steadily, from 1 all the way up to 7, and the interval between increments keeps getting longer. This matches the doubling backoff used for retransmitting the SYN while establishing the TCP connection.
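Assuming the default initial retransmission timeout of 1 second and exponential doubling (a sketch; the exact retry count depends on net.ipv4.tcp_syn_retries), the retransmitted SYNs, and therefore the counter increments, are expected at roughly the following offsets:

/* syn_backoff.c -- rough schedule of SYN retransmissions, assuming a
 * 1 second initial RTO and doubling backoff (sketch, not kernel code). */
#include <stdio.h>

int main(void)
{
	int retries = 7;	/* matches the 7 TCPTimeouts observed below */
	double t = 0.0, rto = 1.0;

	for (int i = 1; i <= retries; i++) {
		t += rto;	/* next retransmitted SYN, i.e. another dropped packet */
		printf("retransmission %d at ~%.0f s after connect()\n", i, t);
		rto *= 2;
	}
	printf("connection aborted shortly after the last retry\n");
	return 0;
}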

tsecer@harry: watch conntrack -S
tsecer@harry:
cpu=0 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=1 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=2 found=0 invalid=0 ignore=2 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=3 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=4 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=2
cpu=5 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=6 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=7 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=8 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=9 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=10 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=11 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=12 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=13 found=0 invalid=6 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=14 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=15 found=7 invalid=0 ignore=0 insert=0 insert_failed=7 drop=7 early_drop=0 error=0 search_restart=0
cpu=16 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=1
cpu=17 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=18 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=19 found=0 invalid=0 ignore=2 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=20 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=21 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=22 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=23 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=2
cpu=24 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=25 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=26 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=27 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=28 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=29 found=0 invalid=0 ignore=3 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=30 found=0 invalid=6 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=31 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0

netstat -s shows 7 TCPTimeouts, corresponding to the 7 retransmitted SYNs of the connection attempt; those 7 retransmissions add up to a single connection failure caused by timeout ("1 connections aborted due to timeout").

tsecer@harry: netstat -s
Ip:
    Forwarding: 2
    11631 total packets received
    ......
TcpExt:
    390 TCP sockets finished time wait in fast timer
    10 delayed acks sent
    1 delayed acks further delayed because of locked socket
    Quick ack mode was activated 45 times
    1375 packet headers predicted
    2695 acknowledgments not containing data payload received
    845 predicted acknowledgments
    TCPTimeouts: 7
    TCPLossProbes: 3
    TCPDSACKOldSent: 45
    TCPDSACKRecv: 1
    8 connections reset due to unexpected data
    23 connections reset due to early user close
    1 connections aborted due to timeout

References

The referenced article points out that the "insert_failed" counter can be checked in the output of conntrack -S; the --random-fully option it recommends, used the same way in the author's test setup, is also the flag added to the MASQUERADE entries in k8s.
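For reference, a variant of the test rule above that uses a source port range together with --random-fully might look like this (an untested sketch; with a single fixed port the flag obviously cannot help, and the exact MASQUERADE rule k8s installs depends on the kube-proxy version):

tsecer@harry: sudo iptables -t nat -A POSTROUTING -p tcp -d 127.0.0.1 --dport 8080 -j SNAT --to-source 127.0.0.1:40000-40100 --random-fully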

outro

The test case looks like a "Nobody does that, do they?" kind of misuse. In real environments, however (not certain, but presumably for the same reason), the same symptom and problem can appear: the immediate cause is different, but the underlying mechanism is the same, a duplicate SNAT tuple is chosen, the packet is dropped, and the connection is delayed.

In addition, when k8s accesses IPs outside the cluster it also goes through MASQUERADE, another form of SNAT, and that is exactly what motivated digging into this part of the logic.
