lvmlockd: unlock lv if command fails before lock completes

If lvmlockd acquires an lv lock for a command, but the
command exits before the reply, then the command has
not activated the lv and lvmlockd should unlock it.
This only applies when the lv was not already locked.

(There will always be a chance that the lv lock is held
while the lv is not active, i.e. if the command fails in
the small window between getting the lv lock and before
doing the activation.  In that case, rerunning the
activation command corrects the inconsistency.)

This commit helps by automatically clearing the
inconsistency (lv locked by not activated) in the most
common case when the lv lock operation is slow to
complete and the command is canceled by the user.

This commit also adds and cleans up references to the
client id in a bunch of log messages, which is useful
to follow processing on each independent lock request.
diff --git a/daemons/lvmlockd/lvmlockd-core.c b/daemons/lvmlockd/lvmlockd-core.c
index 7d5b158..487b6ec 100644
--- a/daemons/lvmlockd/lvmlockd-core.c
+++ b/daemons/lvmlockd/lvmlockd-core.c
@@ -1031,10 +1031,12 @@
 
 	memset(&vb, 0, sizeof(vb));
 
+	r->last_client_id = act->client_id;
+
 	if (r->type == LD_RT_LV)
-		log_debug("S %s R %s res_lock mode %s (%s)", ls->name, r->name, mode_str(act->mode), act->lv_name);
+		log_debug("S %s R %s res_lock cl %u mode %s (%s)", ls->name, r->name, act->client_id, mode_str(act->mode), act->lv_name);
 	else
-		log_debug("S %s R %s res_lock mode %s", ls->name, r->name, mode_str(act->mode));
+		log_debug("S %s R %s res_lock cl %u mode %s", ls->name, r->name, act->client_id, mode_str(act->mode));
 
 	if (r->mode == LD_LK_SH && act->mode == LD_LK_SH)
 		goto add_lk;
@@ -1045,10 +1047,10 @@
 	rv = lm_lock(ls, r, act->mode, act, &vb, retry, act->flags & LD_AF_ADOPT);
 
 	if (r->use_vb)
-		log_debug("S %s R %s res_lock %d read vb %x %x %u",
+		log_debug("S %s R %s res_lock rv %d read vb %x %x %u",
 			  ls->name, r->name, rv, vb.version, vb.flags, vb.r_version);
 	else
-		log_debug("S %s R %s res_lock %d", ls->name, r->name, rv);
+		log_debug("S %s R %s res_lock rv %d", ls->name, r->name, rv);
 
 	if (rv < 0)
 		return rv;
@@ -1298,6 +1300,26 @@
 		lk->client_id = 0;
 	}
 
+	/*
+	 * LV_LOCK means the action acquired the lv lock in the lock manager
+	 * (as opposed to finding that the lv lock was already held).  If
+	 * the client for this LV_LOCK action fails before we send the result,
+	 * then we automatically unlock the lv since the lv wasn't activated.
+	 * (There will always be an odd chance the lv lock is held while the
+	 * lv is not active, but this helps.)  The most common case where this
+	 * is helpful is when the lv lock operation is slow/delayed and the
+	 * command is canceled by the user.
+	 *
+	 * LV_UNLOCK means the lv unlock action was generated by lvmlockd when
+	 * it tried to send the reply for an lv lock action (with LV_LOCK set),
+	 * and failed to send the reply to the client/command.  The
+	 * last_client_id saved on the resource is compared to this LV_UNLOCK
+	 * action before the auto unlock is done in case another action locked
+	 * the lv between the failed client lock action and the auto unlock.
+	 */
+	if (r->type == LD_RT_LV)
+		act->flags |= LD_AF_LV_LOCK;
+
 	list_add_tail(&lk->list, &r->locks);
 
 	return rv;
@@ -1309,7 +1331,9 @@
 	uint32_t r_version;
 	int rv;
 
-	log_debug("S %s R %s res_convert mode %d", ls->name, r->name, act->mode);
+	r->last_client_id = act->client_id;
+
+	log_debug("S %s R %s res_convert cl %u mode %d", ls->name, r->name, act->client_id, act->mode);
 
 	if (act->mode == LD_LK_EX && lk->mode == LD_LK_SH && r->sh_count > 1)
 		return -EAGAIN;
@@ -1387,7 +1411,7 @@
 	return -ENOENT;
 
 do_cancel:
-	log_debug("S %s R %s res_cancel client %d", ls->name, r->name, cact->client_id);
+	log_debug("S %s R %s res_cancel cl %u", ls->name, r->name, cact->client_id);
 	cact->result = -ECANCELED;
 	list_del(&cact->list);
 	add_client_result(cact);
@@ -1435,22 +1459,32 @@
 	}
 
 	if (act->op != LD_OP_CLOSE)
-		log_debug("S %s R %s res_unlock no locks", ls->name, r->name);
+		log_debug("S %s R %s res_unlock cl %u no locks", ls->name, r->name, act->client_id);
 	return -ENOENT;
 
 do_unlock:
+	if ((act->flags & LD_AF_LV_UNLOCK) && (r->last_client_id != act->client_id)) {
+		log_debug("S %s R %s res_unlock cl %u for failed client ignored, last client %u",
+			  ls->name, r->name, act->client_id, r->last_client_id);
+		return -ENOENT;
+	}
+
+	r->last_client_id = act->client_id;
+
 	if (act->op == LD_OP_CLOSE)
-		log_debug("S %s R %s res_unlock from close", ls->name, r->name);
+		log_debug("S %s R %s res_unlock cl %u from close", ls->name, r->name, act->client_id);
 	else if (r->type == LD_RT_LV)
-		log_debug("S %s R %s res_unlock (%s)", ls->name, r->name, act->lv_name);
+		log_debug("S %s R %s res_unlock cl %u (%s)", ls->name, r->name, act->client_id, act->lv_name);
 	else
-		log_debug("S %s R %s res_unlock", ls->name, r->name);
+		log_debug("S %s R %s res_unlock cl %u", ls->name, r->name, act->client_id);
 
 	/* send unlock to lm when last sh lock is unlocked */
 	if (lk->mode == LD_LK_SH) {
 		r->sh_count--;
-		if (r->sh_count > 0)
+		if (r->sh_count > 0) {
+			log_debug("S %s R %s res_unlock sh_count %u", ls->name, r->name, r->sh_count);
 			goto rem_lk;
+		}
 	}
 
 	if ((r->type == LD_RT_GL) && (r->mode == LD_LK_EX)) {
@@ -1498,14 +1532,14 @@
 
 	lk = find_lock_client(r, act->client_id);
 	if (!lk) {
-		log_error("S %s R %s res_update client %u lock not found",
+		log_error("S %s R %s res_update cl %u lock not found",
 			  ls->name, r->name, act->client_id);
 		return -ENOENT;
 	}
 
 	if (r->mode != LD_LK_EX) {
-		log_error("S %s R %s res_update version on non-ex lock",
-			  ls->name, r->name);
+		log_error("S %s R %s res_update cl %u version on non-ex lock",
+			  ls->name, r->name, act->client_id);
 		return -EINVAL;
 	}
 
@@ -1516,7 +1550,7 @@
 	else
 		lk->version = act->version;
 
-	log_debug("S %s R %s res_update lk version to %u", ls->name, r->name, lk->version);
+	log_debug("S %s R %s res_update cl %u lk version to %u", ls->name, r->name, act->client_id, lk->version);
 
 	return 0;
 }
@@ -1716,6 +1750,7 @@
 			continue;
 		} else {
 			/* success */
+			r->last_client_id = act->client_id;
 			act->result = -EALREADY;
 			list_del(&act->list);
 			add_client_result(act);
@@ -1750,6 +1785,7 @@
 			continue;
 		} else {
 			/* success */
+			r->last_client_id = act->client_id;
 			act->result = -EALREADY;
 			list_del(&act->list);
 			add_client_result(act);
@@ -1787,6 +1823,7 @@
 
 		if ((lk->mode == LD_LK_EX) ||
 		    (lk->mode == LD_LK_SH && act->mode == LD_LK_SH)) {
+			r->last_client_id = act->client_id;
 			act->result = 0;
 			list_del(&act->list);
 			add_client_result(act);
@@ -1794,6 +1831,7 @@
 			/* persistent lock is sh, transient request is ex */
 			/* FIXME: can we remove this case? do a convert here? */
 			log_debug("res_process %s existing persistent lock new transient", r->name);
+			r->last_client_id = act->client_id;
 			act->result = -EEXIST;
 			list_del(&act->list);
 			add_client_result(act);
@@ -1828,10 +1866,12 @@
 		if (lk->mode != act->mode) {
 			/* FIXME: convert and change to persistent? */
 			log_debug("res_process %s existing transient lock new persistent", r->name);
+			r->last_client_id = act->client_id;
 			act->result = -EEXIST;
 			list_del(&act->list);
 			add_client_result(act);
 		} else {
+			r->last_client_id = act->client_id;
 			lk->flags |= LD_LF_PERSISTENT;
 			lk->client_id = 0;
 			act->result = 0;
@@ -1880,7 +1920,6 @@
 	 * lv lock conflicts won't be transient so don't retry them.
 	 */
 
-
 	if (r->mode == LD_LK_EX)
 		return;
 
@@ -3491,14 +3530,14 @@
 
 	if (!cl->poll_ignore || cl->fd == -1 || cl->pi == -1) {
 		/* shouldn't happen */
-		log_error("client_resume %d bad state ig %d fd %d pi %d",
+		log_error("client_resume %u bad state ig %d fd %d pi %d",
 			  cl->id, cl->poll_ignore, cl->fd, cl->pi);
 		return;
 	}
 
 	pthread_mutex_lock(&pollfd_mutex);
 	if (pollfd[cl->pi].fd != POLL_FD_IGNORE) {
-		log_error("client_resume %d pi %d fd %d not IGNORE",
+		log_error("client_resume %u pi %d fd %d not IGNORE",
 			  cl->id, cl->pi, cl->fd);
 	}
 	pollfd[cl->pi].fd = cl->fd;
@@ -3509,16 +3548,17 @@
 }
 
 /* called from client_thread, cl->mutex is held */
-static void client_send_result(struct client *cl, struct action *act)
+static int client_send_result(struct client *cl, struct action *act)
 {
 	response res;
 	char result_flags[128];
 	int dump_len = 0;
 	int dump_fd = -1;
+	int rv = 0;
 
 	if (cl->dead) {
-		log_debug("client send %d skip dead", cl->id);
-		return;
+		log_debug("send cl %u skip dead", cl->id);
+		return -1;
 	}
 
 	memset(result_flags, 0, sizeof(result_flags));
@@ -3580,7 +3620,7 @@
 		if (act->lv_args[0])
 			lv_args = act->lv_args;
 
-		log_debug("send %s[%d.%u] %s %s rv %d vg_args %s lv_args %s",
+		log_debug("send %s[%d] cl %u %s %s rv %d vg_args %s lv_args %s",
 			  cl->name[0] ? cl->name : "client", cl->pid, cl->id,
 			  op_str(act->op), rt_str(act->rt),
 			  act->result, vg_args ? vg_args : "", lv_args ? lv_args : "");
@@ -3612,7 +3652,7 @@
 		else
 			act->result = -EINVAL;
 
-		log_debug("send %s[%d.%u] dump result %d dump_len %d",
+		log_debug("send %s[%d] cl %u dump result %d dump_len %d",
 			  cl->name[0] ? cl->name : "client", cl->pid, cl->id,
 			  act->result, dump_len);
 
@@ -3625,7 +3665,7 @@
 		 * A normal reply.
 		 */
 
-		log_debug("send %s[%d.%u] %s %s rv %d %s %s",
+		log_debug("send %s[%d] cl %u %s %s rv %d %s %s",
 			  cl->name[0] ? cl->name : "client", cl->pid, cl->id,
 			  op_str(act->op), rt_str(act->rt),
 			  act->result, (act->result == -ENOLS) ? "ENOLS" : "", result_flags);
@@ -3639,7 +3679,13 @@
 					  NULL);
 	}
 
-	buffer_write(cl->fd, &res.buffer);
+	if (!buffer_write(cl->fd, &res.buffer)) {
+		rv = -errno;
+		if (rv >= 0)
+			rv = -1;
+		log_debug("send cl %u fd %d error %d", cl->id, cl->fd, rv);
+	}
+
 	buffer_destroy(&res.buffer);
 
 	client_resume(cl);
@@ -3649,6 +3695,8 @@
 		send_dump_buf(dump_fd, dump_len);
 		close(dump_fd);
 	}
+
+	return rv;
 }
 
 /* called from client_thread */
@@ -4303,10 +4351,10 @@
 	rv = buffer_read(cl->fd, &req.buffer);
 	if (!rv) {
 		if (errno == ECONNRESET) {
-			log_debug("client recv %d ECONNRESET", cl->id);
+			log_debug("client recv %u ECONNRESET", cl->id);
 			cl->dead = 1;
 		} else {
-			log_error("client recv %d buffer_read error %d", cl->id, errno);
+			log_error("client recv %u buffer_read error %d", cl->id, errno);
 		}
 		buffer_destroy(&req.buffer);
 		client_resume(cl);
@@ -4315,7 +4363,7 @@
 
 	req.cft = dm_config_from_string(req.buffer.mem);
 	if (!req.cft) {
-		log_error("client recv %d config_from_string error", cl->id);
+		log_error("client recv %u config_from_string error", cl->id);
 		buffer_destroy(&req.buffer);
 		client_resume(cl);
 		return;
@@ -4324,7 +4372,7 @@
 	str = daemon_request_str(req, "request", NULL);
 	rv = str_to_op_rt(str, &op, &rt);
 	if (rv < 0) {
-		log_error("client recv %d bad request name \"%s\"", cl->id, str ? str : "");
+		log_error("client recv %u bad request name \"%s\"", cl->id, str ? str : "");
 		dm_config_destroy(req.cft);
 		buffer_destroy(&req.buffer);
 		client_resume(cl);
@@ -4447,7 +4495,7 @@
 	dm_config_destroy(req.cft);
 	buffer_destroy(&req.buffer);
 
-	log_debug("recv %s[%d.%u] %s %s \"%s\" mode %s flags %x",
+	log_debug("recv %s[%d] cl %u %s %s \"%s\" mode %s flags %x",
 		  cl->name[0] ? cl->name : "client", cl->pid, cl->id,
 		  op_str(act->op), rt_str(act->rt), act->vg_name, mode_str(act->mode), opts);
 
@@ -4514,6 +4562,8 @@
 {
 	struct client *cl;
 	struct action *act;
+	struct action *act_un;
+	int rv;
 
 	while (1) {
 		pthread_mutex_lock(&client_mutex);
@@ -4537,11 +4587,30 @@
 
 			if (cl) {
 				pthread_mutex_lock(&cl->mutex);
-				client_send_result(cl, act);
+				rv = client_send_result(cl, act);
 				pthread_mutex_unlock(&cl->mutex);
 			} else {
-				log_debug("no client for result");
+				log_debug("no client %u for result", act->client_id);
+				rv = -1;
 			}
+
+			/*
+			 * The client failed after we acquired an LV lock for
+			 * it, but before getting this reply saying it's done.
+			 * So the lv will not be active and we should release
+			 * the lv lock it requested.
+			 */
+			if ((rv < 0) && (act->flags & LD_AF_LV_LOCK)) {
+				log_debug("auto unlock lv for failed client %u", act->client_id);
+				if ((act_un = alloc_action())) {
+					memcpy(act_un, act, sizeof(struct action));
+					act_un->mode = LD_LK_UN;
+					act_un->flags |= LD_AF_LV_UNLOCK;
+					act_un->flags &= ~LD_AF_LV_LOCK;
+					add_lock_action(act_un);
+				}
+			}
+
 			free_action(act);
 			continue;
 		}
@@ -4571,6 +4640,7 @@
 				log_debug("client rem %d pi %d fd %d ig %d",
 					  cl->id, cl->pi, cl->fd, cl->poll_ignore);
 				*/
+
 				/*
 				 * If cl->dead was set in main_loop, then the
 				 * fd has already been closed and the pollfd
@@ -5620,7 +5690,7 @@
 	list_add_tail(&cl->list, &client_list);
 	pthread_mutex_unlock(&client_mutex);
 
-	log_debug("client add id %d pi %d fd %d", cl->id, cl->pi, cl->fd);
+	log_debug("new cl %u pi %d fd %d", cl->id, cl->pi, cl->fd);
 }
 
 /*
@@ -5763,14 +5833,14 @@
 
 				if (cl->recv) {
 					/* should not happen */
-					log_error("main client %d already recv", cl->id);
+					log_error("main client %u already recv", cl->id);
 
 				} else if (cl->dead) {
 					/* should not happen */
-					log_error("main client %d already dead", cl->id);
+					log_error("main client %u already dead", cl->id);
 
 				} else if (is_dead) {
-					log_debug("close %s[%d.%u] fd %d",
+					log_debug("close %s[%d] cl %u fd %d",
 						  cl->name[0] ? cl->name : "client",
 						  cl->pid, cl->id, cl->fd);
 					cl->dead = 1;
diff --git a/daemons/lvmlockd/lvmlockd-internal.h b/daemons/lvmlockd/lvmlockd-internal.h
index a90dd99..3afa75a 100644
--- a/daemons/lvmlockd/lvmlockd-internal.h
+++ b/daemons/lvmlockd/lvmlockd-internal.h
@@ -103,6 +103,8 @@
 #define LD_AF_DUP_GL_LS            0x00002000
 #define LD_AF_ADOPT                0x00010000
 #define LD_AF_WARN_GL_REMOVED	   0x00020000
+#define LD_AF_LV_LOCK              0x00040000
+#define LD_AF_LV_UNLOCK            0x00080000
 
 /*
  * Number of times to repeat a lock request after
@@ -141,6 +143,7 @@
 	int8_t mode;
 	unsigned int sh_count;		/* number of sh locks on locks list */
 	uint32_t version;
+	uint32_t last_client_id;	/* last client_id to lock or unlock resource */
 	unsigned int lm_init : 1;	/* lm_data is initialized */
 	unsigned int adopt : 1;		/* temp flag in remove_inactive_lvs */
 	unsigned int version_zero_valid : 1;