DLM: retry rcom when dlm_wait_function is timed out.
author tsutomu.owa@toshiba.co.jp <tsutomu.owa@toshiba.co.jp>
Tue, 12 Sep 2017 08:56:08 +0000 (08:56 +0000)
committer David Teigland <teigland@redhat.com>
Mon, 25 Sep 2017 17:45:21 +0000 (12:45 -0500)
If a node sends a DLM_RCOM_STATUS command and an error occurs on the
receiving side, the DLM_RCOM_STATUS_REPLY response may not be returned.
Retransmit the DLM_RCOM_STATUS command when the wait times out so that
we do not wait indefinitely for a response.

Signed-off-by: Tadashi Miyauchi <miyauchi@toshiba-tops.co.jp>
Signed-off-by: Tsutomu Owa <tsutomu.owa@toshiba.co.jp>
Signed-off-by: David Teigland <teigland@redhat.com>
fs/dlm/rcom.c
fs/dlm/recover.c

index f3f5e72a29ba220542ac0c630fc99cd49664cdf2..4ff061de927e1a8e53f2c38cfb8cd41baced274c 100644 (file)
@@ -155,6 +155,7 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags)
                goto out;
        }
 
+retry:
        error = create_rcom(ls, nodeid, DLM_RCOM_STATUS,
                            sizeof(struct rcom_status), &rc, &mh);
        if (error)
@@ -169,6 +170,8 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags)
 
        error = dlm_wait_function(ls, &rcom_response);
        disallow_sync_reply(ls);
+       if (error == -ETIMEDOUT)
+               goto retry;
        if (error)
                goto out;
 
@@ -276,6 +279,7 @@ int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
 
        ls->ls_recover_nodeid = nodeid;
 
+retry:
        error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, &rc, &mh);
        if (error)
                goto out;
@@ -288,6 +292,8 @@ int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
 
        error = dlm_wait_function(ls, &rcom_response);
        disallow_sync_reply(ls);
+       if (error == -ETIMEDOUT)
+               goto retry;
  out:
        return error;
 }
index eaea789bf97d0dd862545c202156484a532eedbb..ce2aa54ca2e2484f35b11d2cc12614b080656465 100644 (file)
@@ -52,6 +52,10 @@ int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls))
                                        dlm_config.ci_recover_timer * HZ);
                if (rv)
                        break;
+               if (test_bit(LSFL_RCOM_WAIT, &ls->ls_flags)) {
+                       log_debug(ls, "dlm_wait_function timed out");
+                       return -ETIMEDOUT;
+               }
        }
 
        if (dlm_recovery_stopped(ls)) {