cluster: RHEL56 - Fix potential cluster mirror corruption: 456575/471291 - cluster-commits - Fedora mailing-lists

30 Aug 2010

Gitweb:        http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=95d...
Commit:        95d4db6c9a10517a7ed981be2271e5a602e0bcc2
Parent:        8e5bfda2ffd80ebfcdc00cadd503bfcfd5422276
Author:        Jonathan Brassow jbrassow@redhat.com
AuthorDate:    Mon Aug 30 15:54:38 2010 -0500
Committer:     Jonathan Brassow jbrassow@redhat.com
CommitterDate: Mon Aug 30 15:54:38 2010 -0500
Fix potential cluster mirror corruption: 456575/471291
...
From my inline comments:
* If the mirror was successfully recovered, we want to always
* force every machine to write to all devices - otherwise,
* corruption will occur.  Here's how:
*    Node1 suffers a failure and marks a region out-of-sync
*    Node2 attempts a write, gets by is_remote_recovering,
*          and queries the sync status of the region - finding
*          it out-of-sync.
*    Node2 thinks the write should be a nosync write, but it
*          hasn't suffered the drive failure that Node1 has yet.
*          It then issues a generic_make_request directly to
*          the primary image only - which is exactly the device
*          that has suffered the failure.
*    Node2 suffers a lost write - which completely bypasses the
*          mirror layer because it had gone through generic_m_r.
*    The file system will likely explode at this point due to
*    I/O errors.  If it wasn't the primary that failed, it is
*    easily possible in this case to issue writes to just one
*    of the remaining images - also leaving the mirror inconsistent.
*
* We let in_sync() return 1 in a cluster regardless of what is
* in the bitmap once recovery has successfully completed on a
* mirror.  This ensures the mirroring code will continue to
* attempt to write to all mirror images.  The worst that can
* happen for reads is that additional read attempts may be
* taken.
---
 cmirror/src/functions.c |   44 ++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/cmirror/src/functions.c b/cmirror/src/functions.c
index f7270a6..760a6d0 100644
--- a/cmirror/src/functions.c
+++ b/cmirror/src/functions.c
@@ -53,6 +53,7 @@ struct log_c {
time_t delay; /* limits how fast a resume can happen after suspend */
    int touched;
+	int in_sync;  /* An in-sync that stays set until suspend/resume */
    uint32_t region_size;
    uint32_t region_count;
    uint64_t sync_count;
@@ -718,6 +719,7 @@ static int clog_resume(struct clog_tfr *tfr)
    if (!lc)
    	return -EINVAL;
+	lc->in_sync = 0;
    switch (lc->resume_override) {
    case 1000:
    	LOG_ERROR("[%s] Additional resume issued before suspend",
@@ -971,6 +973,41 @@ static int clog_in_sync(struct clog_tfr *tfr)
    	return -EINVAL;
*rtn = log_test_bit(lc->sync_bits, region);
+
+	/*
+	 * If the mirror was successfully recovered, we want to always
+	 * force every machine to write to all devices - otherwise,
+	 * corruption will occur.  Here's how:
+	 *    Node1 suffers a failure and marks a region out-of-sync
+	 *    Node2 attempts a write, gets by is_remote_recovering,
+	 *          and queries the sync status of the region - finding
+	 *            it out-of-sync.
+	 *    Node2 thinks the write should be a nosync write, but it
+	 *          hasn't suffered the drive failure that Node1 has yet.
+	 *          It then issues a generic_make_request directly to
+	 *          the primary image only - which is exactly the device
+	 *          that has suffered the failure.
+	 *    Node2 suffers a lost write - which completely bypasses the
+	 *          mirror layer because it had gone through generic_m_r.
+	 *    The file system will likely explode at this point due to
+	 *    I/O errors.  If it wasn't the primary that failed, it is
+	 *    easily possible in this case to issue writes to just one
+	 *    of the remaining images - also leaving the mirror inconsistent.
+	 *
+	 * We let in_sync() return 1 in a cluster regardless of what is
+	 * in the bitmap once recovery has successfully completed on a
+	 * mirror.  This ensures the mirroring code will continue to
+	 * attempt to write to all mirror images.  The worst that can
+	 * happen for reads is that additional read attempts may be
+	 * taken.
+	 *
+	 * Further investigation may be required to determine if there are
+	 * similar possible outcomes when the mirror is in the process of
+	 * recovering.  In that case, lc->in_sync would not have been set
+	 * yet.
+	 */
+	if (!*rtn && lc->in_sync)
+		*rtn = 1;
    if (*rtn)
    	LOG_DBG("[%s] Region is in-sync: %llu",
    		SHORT_UUID(lc->uuid), (unsigned long long)region);
@@ -1302,8 +1339,8 @@ static int clog_set_region_sync(struct clog_tfr *tfr)
    			lc->skip_bit_warning = lc->region_count;
if (pkg->region > (lc->skip_bit_warning + 5)) {
-				LOG_ERROR("*** Region #%llu skipped during recovery ***",
-					  (unsigned long long)lc->skip_bit_warning);
+				LOG_SPRINT("*** Region #%llu skipped during recovery ***",
+					   (unsigned long long)lc->skip_bit_warning);
    			lc->skip_bit_warning = lc->region_count;
 #ifdef DEBUG
    			kill(getpid(), SIGUSR1);
@@ -1344,6 +1381,9 @@ static int clog_set_region_sync(struct clog_tfr *tfr)
    		   "(lc->sync_count > lc->region_count) - this is bad",
    		   tfr->seq, SHORT_UUID(lc->uuid), tfr->originator);
+	if (lc->sync_count == lc->region_count)
+		lc->in_sync = 1;
+
    tfr->data_size = 0;
    return 0;
 }