platform: cope differently with spurious RTM_DELLINK message when unslaving bridge-slave

Unslaving from a bridge causes a wrong RTM_DELLINK event for
the former slave.

    # ip link add dummy0 type dummy
    # ip link add bridge0 type bridge
    # ip link set bridge0 up
    # ip link set dummy0 master bridge0
    # ip monitor link &
    # ip link set dummy0 nomaster
    18: dummy0: <BROADCAST,NOARP> mtu 1500 qdisc noop master bridge0 state DOWN group default
        link/ether 76:44:5f:b9:38:02 brd ff:ff:ff:ff:ff:ff
    18: dummy0: <BROADCAST,NOARP> mtu 1500 master bridge0 state DOWN
        link/ether 76:44:5f:b9:38:02
    Deleted 18: dummy0: <BROADCAST,NOARP> mtu 1500 master bridge0 state DOWN
        link/ether 76:44:5f:b9:38:02
    18: dummy0: <BROADCAST,NOARP> mtu 1500 qdisc noop state DOWN group default
        link/ether 76:44:5f:b9:38:02 brd ff:ff:ff:ff:ff:ff
    19: bridge0: <NO-CARRIER,BROADCAST,MULTICAST,UP> mtu 1500 qdisc noqueue state DOWN group default
        link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff
    19: bridge0: <NO-CARRIER,BROADCAST,MULTICAST,UP> mtu 1500 qdisc noqueue state DOWN group default
        link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff

Previously, during do_request_link() we would remember the link that is
about to be requested (delayed_deletion) and delay processing a new
RTM_DELLINK message until the end of do_request_link() -- and possibly
forget about about the deletion, if RTM_DELLINK was followed by a
RTM_NEWLINK.

However, this hack does not catch the case where an external command
unslaves the link.

Instead just accept the wrong event and raise a "removed" signal right
away. This brings the cache in an externally visible, wrong state that
will be fixed by a following "added" signal.

Still do that because working around the kernel bug is complicated. Also,
we already might emit wrong "added" signals for devices that are already
removed. As a consequence, a user should not consider the platform signals
until all events are processed.
Listeners to that signal should accept that added/removed link changes
can be wrong and should preferably handle them idly, when the events
have settled.

It can even be worse, that a RTM_DELLINK is not fixed by a following
RTM_NEWLINK:

    ...
    # ip link set dummy0 nomaster
    36: dummy0: <BROADCAST,NOARP> mtu 1500 qdisc noop master bridge0 state DOWN
        link/ether e2:f2:20:98:3a:be brd ff:ff:ff:ff:ff:ff
    36: dummy0: <BROADCAST,NOARP> mtu 1500 master bridge0 state DOWN
        link/ether e2:f2:20:98:3a:be
    Deleted 36: dummy0: <BROADCAST,NOARP> mtu 1500 master bridge0 state DOWN
        link/ether e2:f2:20:98:3a:be
    37: bridge0: <NO-CARRIER,BROADCAST,MULTICAST,UP> mtu 1500 qdisc noqueue state DOWN
        link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff
    37: bridge0: <NO-CARRIER,BROADCAST,MULTICAST,UP> mtu 1500 qdisc noqueue state DOWN
        link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff

So, when a slave is deleted, we have to refetch it too.

https://bugzilla.redhat.com/show_bug.cgi?id=1285719
This commit is contained in:
Thomas Haller
2015-11-27 11:20:58 +01:00
parent 83240f24ae
commit 8a87a91813
2 changed files with 78 additions and 64 deletions

View File

@@ -2200,7 +2200,6 @@ struct _NMLinuxPlatformPrivate {
} delayed_action;
GHashTable *prune_candidates;
GHashTable *delayed_deletion;
GHashTable *wifi_data;
};
@@ -2598,39 +2597,6 @@ cache_prune_candidates_prune (NMPlatform *platform)
g_hash_table_unref (prune_candidates);
}
static void
cache_delayed_deletion_prune (NMPlatform *platform)
{
NMLinuxPlatformPrivate *priv = NM_LINUX_PLATFORM_GET_PRIVATE (platform);
GPtrArray *prune_list = NULL;
GHashTableIter iter;
guint i;
NMPObject *obj;
if (g_hash_table_size (priv->delayed_deletion) == 0)
return;
g_hash_table_iter_init (&iter, priv->delayed_deletion);
while (g_hash_table_iter_next (&iter, NULL, (gpointer *) &obj)) {
if (obj) {
if (!prune_list)
prune_list = g_ptr_array_new_full (g_hash_table_size (priv->delayed_deletion), (GDestroyNotify) nmp_object_unref);
g_ptr_array_add (prune_list, nmp_object_ref (obj));
}
}
g_hash_table_remove_all (priv->delayed_deletion);
if (prune_list) {
for (i = 0; i < prune_list->len; i++) {
obj = prune_list->pdata[i];
_LOGt ("delayed-deletion: delete %s", nmp_object_to_string (obj, NMP_OBJECT_TO_STRING_ID, NULL, 0));
cache_remove_netlink (platform, obj, NULL, NULL, NM_PLATFORM_REASON_EXTERNAL);
}
g_ptr_array_unref (prune_list);
}
}
static void
cache_pre_hook (NMPCache *cache, const NMPObject *old, const NMPObject *new, NMPCacheOpsType ops_type, gpointer user_data)
{
@@ -2805,7 +2771,22 @@ cache_pre_hook (NMPCache *cache, const NMPObject *old, const NMPObject *new, NMP
if (ifindex2 > 0 && ifindex1 != ifindex2)
delayed_action_schedule (platform, DELAYED_ACTION_TYPE_REFRESH_LINK, GINT_TO_POINTER (ifindex2));
}
}
{
if ( ( (ops_type == NMP_CACHE_OPS_REMOVED)
|| ( (ops_type == NMP_CACHE_OPS_UPDATED)
&& new
&& !new->_link.netlink.is_in_netlink))
&& old
&& old->_link.netlink.is_in_netlink
&& old->link.master) {
/* sometimes we receive a wrong RTM_DELLINK message when unslaving
* a device. Refetch the link again to check whether the device
* is really gone.
*
* https://bugzilla.redhat.com/show_bug.cgi?id=1285719#c2 */
delayed_action_schedule (platform, DELAYED_ACTION_TYPE_REFRESH_LINK, GINT_TO_POINTER (old->link.ifindex));
}
}
break;
case NMP_OBJECT_TYPE_IP4_ADDRESS:
@@ -2899,13 +2880,8 @@ do_request_link (NMPlatform *platform, int ifindex, const char *name, gboolean h
_LOGD ("do-request-link: %d %s", ifindex, name ? name : "");
if (ifindex > 0) {
NMPObject *obj;
cache_prune_candidates_record_one (platform,
(NMPObject *) nmp_cache_lookup_link (priv->cache, ifindex));
obj = nmp_object_new_link (ifindex);
_LOGt ("delayed-deletion: protect object %s", nmp_object_to_string (obj, NMP_OBJECT_TO_STRING_ID, NULL, 0));
g_hash_table_insert (priv->delayed_deletion, obj, NULL);
}
event_handler_read_netlink_all (platform, FALSE);
@@ -2926,7 +2902,6 @@ do_request_link (NMPlatform *platform, int ifindex, const char *name, gboolean h
event_handler_read_netlink_all (platform, TRUE);
cache_delayed_deletion_prune (platform);
cache_prune_candidates_prune (platform);
if (handle_delayed_action)
@@ -3084,30 +3059,12 @@ event_notification (struct nl_msg *msg, gpointer user_data)
switch (msghdr->nlmsg_type) {
case RTM_NEWLINK:
if (NMP_OBJECT_GET_TYPE (obj) == NMP_OBJECT_TYPE_LINK) {
if (g_hash_table_lookup (priv->delayed_deletion, obj) != NULL) {
/* the object is scheduled for delayed deletion. Replace that object
* by clearing the value from priv->delayed_deletion. */
_LOGt ("delayed-deletion: clear delayed deletion of protected object %s", nmp_object_to_string (obj, NMP_OBJECT_TO_STRING_ID, NULL, 0));
g_hash_table_insert (priv->delayed_deletion, nmp_object_ref (obj), NULL);
}
}
/* fall-through */
case RTM_NEWADDR:
case RTM_NEWROUTE:
cache_update_netlink (platform, obj, &obj_cache, NULL, NM_PLATFORM_REASON_EXTERNAL);
break;
case RTM_DELLINK:
if ( NMP_OBJECT_GET_TYPE (obj) == NMP_OBJECT_TYPE_LINK
&& g_hash_table_contains (priv->delayed_deletion, obj)) {
/* We sometimes receive spurious RTM_DELLINK events. In this case, we want to delay
* the deletion of the object until later. */
_LOGt ("delayed-deletion: delay deletion of protected object %s", nmp_object_to_string (obj, NMP_OBJECT_TO_STRING_ID, NULL, 0));
g_hash_table_insert (priv->delayed_deletion, nmp_object_ref (obj), nmp_object_ref (obj));
break;
}
/* fall-through */
case RTM_DELADDR:
case RTM_DELROUTE:
cache_remove_netlink (platform, obj, &obj_cache, NULL, NM_PLATFORM_REASON_EXTERNAL);
@@ -5311,10 +5268,6 @@ nm_linux_platform_init (NMLinuxPlatform *self)
self->priv = priv;
priv->delayed_deletion = g_hash_table_new_full ((GHashFunc) nmp_object_id_hash,
(GEqualFunc) nmp_object_id_equal,
(GDestroyNotify) nmp_object_unref,
(GDestroyNotify) nmp_object_unref);
priv->cache = nmp_cache_new ();
priv->delayed_action.list_master_connected = g_ptr_array_new ();
priv->delayed_action.list_refresh_link = g_ptr_array_new ();
@@ -5417,7 +5370,6 @@ dispose (GObject *object)
nm_clear_g_source (&priv->delayed_action.idle_id);
g_clear_pointer (&priv->prune_candidates, g_hash_table_unref);
g_clear_pointer (&priv->delayed_deletion, g_hash_table_unref);
G_OBJECT_CLASS (nm_linux_platform_parent_class)->dispose (object);
}

View File

@@ -289,10 +289,14 @@ test_slave (int master, int type, SignalData *master_changed)
}
/* Release */
ensure_no_signal (link_added);
ensure_no_signal (link_changed);
ensure_no_signal (link_removed);
g_assert (nm_platform_link_release (NM_PLATFORM_GET, master, ifindex));
g_assert_cmpint (nm_platform_link_get_master (NM_PLATFORM_GET, ifindex), ==, 0);
accept_signals (link_added, 0, 1);
accept_signals (link_changed, 1, 3);
accept_signals (link_removed, 0, 1);
accept_signals (master_changed, 1, 2);
ensure_no_signal (master_changed);
@@ -304,7 +308,9 @@ test_slave (int master, int type, SignalData *master_changed)
ensure_no_signal (master_changed);
/* Remove */
ensure_no_signal (link_added);
ensure_no_signal (link_changed);
ensure_no_signal (link_removed);
g_assert (nm_platform_link_delete (NM_PLATFORM_GET, ifindex));
accept_signals (master_changed, 0, 1);
accept_signals (link_changed, 0, 1);
@@ -1493,6 +1499,61 @@ again:
/*****************************************************************************/
static void
test_nl_bugs_spuroius_dellink (void)
{
const char *IFACE_BRIDGE0 = "nm-test-bridge0";
const char *IFACE_DUMMY0 = "nm-test-dummy0";
int ifindex_bridge0, ifindex_dummy0;
const NMPlatformLink *pllink;
gboolean wait_for_settle;
/* see https://bugzilla.redhat.com/show_bug.cgi?id=1285719 */
nmtstp_run_command_check ("ip link add %s type dummy", IFACE_DUMMY0);
ifindex_dummy0 = nmtstp_assert_wait_for_link (IFACE_DUMMY0, NM_LINK_TYPE_DUMMY, 100)->ifindex;
nmtstp_run_command_check ("ip link add %s type bridge", IFACE_BRIDGE0);
ifindex_bridge0 = nmtstp_assert_wait_for_link (IFACE_BRIDGE0, NM_LINK_TYPE_BRIDGE, 100)->ifindex;
nmtstp_link_set_updown (-1, ifindex_bridge0, TRUE);
nmtstp_run_command_check ("ip link set %s master %s", IFACE_DUMMY0, IFACE_BRIDGE0);
NMTST_WAIT_ASSERT (100, {
nmtstp_wait_for_signal (50);
pllink = nm_platform_link_get (NM_PLATFORM_GET, ifindex_dummy0);
g_assert (pllink);
if (pllink->master == ifindex_bridge0)
break;
});
nm_platform_process_events (NM_PLATFORM_GET);
nmtstp_run_command_check ("ip link set %s nomaster", IFACE_DUMMY0);
wait_for_settle = TRUE;
nmtstp_wait_for_signal (50);
again:
nm_platform_process_events (NM_PLATFORM_GET);
pllink = nm_platform_link_get (NM_PLATFORM_GET, ifindex_bridge0);
g_assert (pllink);
pllink = nm_platform_link_get (NM_PLATFORM_GET, ifindex_dummy0);
g_assert (pllink);
g_assert_cmpint (pllink->parent, ==, 0);
if (wait_for_settle) {
wait_for_settle = FALSE;
NMTST_WAIT (300, { nmtstp_wait_for_signal (50); });
goto again;
}
nm_platform_link_delete (NM_PLATFORM_GET, ifindex_bridge0);
nm_platform_link_delete (NM_PLATFORM_GET, ifindex_dummy0);
}
/*****************************************************************************/
void
init_tests (int *argc, char ***argv)
{
@@ -1531,5 +1592,6 @@ setup_tests (void)
g_test_add_func ("/link/nl-bugs/veth", test_nl_bugs_veth);
g_test_add_func ("/link/nl-bugs/spurious-newlink", test_nl_bugs_spuroius_newlink);
g_test_add_func ("/link/nl-bugs/spurious-dellink", test_nl_bugs_spuroius_dellink);
}
}