Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=83fa41cb23b81108a6ffc6fe79c2656238a0ffb7 Commit: 83fa41cb23b81108a6ffc6fe79c2656238a0ffb7 Parent: 1f283367656fdad0ae5fd66c2cd58ec0fc08f9f4 Author: Lon Hohberger lhh@redhat.com AuthorDate: Mon Dec 21 17:54:37 2009 -0500 Committer: Lon Hohberger lhh@redhat.com CommitterDate: Tue Dec 22 10:53:47 2009 -0500
rgmanager: Fix event generation with central_processing
This patch fixes event generation and processing when a node dies. Previously, when a node failed and was fenced, no events were generated for the services that had been running on that node. As a result, dependent services were often not restarted correctly.
Resolves: rhbz#523999
Signed-off-by: Lon Hohberger lhh@redhat.com --- rgmanager/include/resgroup.h | 2 ++ rgmanager/src/daemons/main.c | 1 + rgmanager/src/daemons/rg_forward.c | 4 ++-- rgmanager/src/daemons/rg_state.c | 9 ++++++--- rgmanager/src/daemons/service_op.c | 2 +- rgmanager/src/resources/default_event_script.sl | 8 ++++++++ 6 files changed, 20 insertions(+), 6 deletions(-)
diff --git a/rgmanager/include/resgroup.h b/rgmanager/include/resgroup.h index 7011a0c..5a13fcf 100644 --- a/rgmanager/include/resgroup.h +++ b/rgmanager/include/resgroup.h @@ -180,6 +180,8 @@ int get_rg_state_local(const char *servicename, rg_state_t *svcblk); uint32_t best_target_node(cluster_member_list_t *allowed, uint32_t owner, const char *rg_name, int lock);
+extern int cluster_timeout; + #ifdef DEBUG int _rg_lock(const char *name, struct dlm_lksb *p); int _rg_lock_dbg(const char *, struct dlm_lksb *, const char *, int); diff --git a/rgmanager/src/daemons/main.c b/rgmanager/src/daemons/main.c index 04d9961..883266a 100644 --- a/rgmanager/src/daemons/main.c +++ b/rgmanager/src/daemons/main.c @@ -34,6 +34,7 @@ void flag_shutdown(int sig); int watchdog_init(void);
+int cluster_timeout = 10; int shutdown_pending = 0, running = 1, need_reconfigure = 0; char debug = 0; /* XXX* */ static int signalled = 0; diff --git a/rgmanager/src/daemons/rg_forward.c b/rgmanager/src/daemons/rg_forward.c index bb42922..48649b8 100644 --- a/rgmanager/src/daemons/rg_forward.c +++ b/rgmanager/src/daemons/rg_forward.c @@ -85,7 +85,7 @@ forwarding_thread(void *arg) build_message(&msg, req->rr_request, req->rr_group, req->rr_target, req->rr_arg0, req->rr_arg1);
- if (msg_open(MSG_CLUSTER, rgs.rs_owner, RG_PORT, ctx, 10) < 0) { + if (msg_open(MSG_CLUSTER, rgs.rs_owner, RG_PORT, ctx, 2 * cluster_timeout) < 0) { logt_print(LOG_DEBUG, "FW: Failed to open channel to %d CTX: %p\n", rgs.rs_owner, ctx); goto out_fail; @@ -184,7 +184,7 @@ forwarding_thread_v2(void *arg) strerror(errno)); goto out_fail; } - if (msg_open(MSG_CLUSTER, target, RG_PORT, ctx, 10) < 0) { + if (msg_open(MSG_CLUSTER, target, RG_PORT, ctx, 2 * cluster_timeout) < 0) { logt_print(LOG_DEBUG, "FW: Failed to open channel to %d CTX: %p\n", target, ctx); goto out_fail; diff --git a/rgmanager/src/daemons/rg_state.c b/rgmanager/src/daemons/rg_state.c index 6f80047..029100e 100644 --- a/rgmanager/src/daemons/rg_state.c +++ b/rgmanager/src/daemons/rg_state.c @@ -969,7 +969,7 @@ get_new_owner(const char *svcName) continue;
if (msg_open(MSG_CLUSTER, membership->cml_members[x].cn_nodeid, - RG_PORT, &ctx, 10) < 0) { + RG_PORT, &ctx, 2 * cluster_timeout) < 0) { /* failed to open: better to claim false successful status rather than claim a failure and possibly end up with a service on >1 node */ @@ -1254,7 +1254,10 @@ _svc_stop(const char *svcName, int req, int recover, uint32_t newstate) rg_unlock(&lockp); return RG_EFAIL; } - /* FALLTHROUGH */ + rg_unlock(&lockp); + broadcast_event(svcName, RG_STATE_STOPPED, + -1, svcStatus.rs_last_owner); + return RG_ESUCCESS; case 2: rg_unlock(&lockp); return RG_ESUCCESS; @@ -1553,7 +1556,7 @@ svc_start_remote(const char *svcName, int request, uint32_t target) msg_relo.sm_data.d_svcOwner = target; /* Open a connection to the other node */
- if (msg_open(MSG_CLUSTER, target, RG_PORT, &ctx, 2)< 0) { + if (msg_open(MSG_CLUSTER, target, RG_PORT, &ctx, 2 * cluster_timeout)< 0) { logt_print(LOG_ERR, "#58: Failed opening connection to member #%d\n", target); diff --git a/rgmanager/src/daemons/service_op.c b/rgmanager/src/daemons/service_op.c index a508f1e..112b267 100644 --- a/rgmanager/src/daemons/service_op.c +++ b/rgmanager/src/daemons/service_op.c @@ -142,7 +142,7 @@ service_op_stop(char *svcName, int do_disable, int event_type) } }
- if (msg_open(MSG_CLUSTER, msgtarget, RG_PORT, &ctx, 2)< 0) { + if (msg_open(MSG_CLUSTER, msgtarget, RG_PORT, &ctx, 2 * cluster_timeout)< 0) { logt_print(LOG_ERR, "#58: Failed opening connection to member #%d\n", my_id()); diff --git a/rgmanager/src/resources/default_event_script.sl b/rgmanager/src/resources/default_event_script.sl index 84e6d72..fad22ac 100644 --- a/rgmanager/src/resources/default_event_script.sl +++ b/rgmanager/src/resources/default_event_script.sl @@ -157,6 +157,14 @@ define move_or_start(service, node_list)
(,,, owner, state) = service_status(service); debug("Evaluating ", service, " state=", state, " owner=", owner); + if ((event_type == EVENT_NODE) and (node_id == owner) and + (node_state == NODE_OFFLINE)) { + info("Marking service ", service, " on down member ", + owner, " as stopped"); + if (service_stop(service) < 0) { + return ERR_ABORT; + } + }
len = length(node_list); if (len == 0) {
cluster-commits@lists.fedorahosted.org