DLM: fix race condition between dlm_send and dlm_recv

When kernel_sendpage() (in send_to_sock) and kernel_recvmsg() (in
receive_from_sock) return an error, dlm_send and dlm_recv may both call
close_connection() at the same time. Each then blocks waiting for the
other in cancel_work_sync(), and the two workers deadlock.

Signed-off-by: Tadashi Miyauchi <miyauchi@toshiba-tops.co.jp>
Signed-off-by: Tsutomu Owa <tsutomu.owa@toshiba.co.jp>
Signed-off-by: David Teigland <teigland@redhat.com>
Author: tsutomu.owa@toshiba.co.jp, 2017-09-12 08:55:50 +00:00; committed by David Teigland
parent f0fb83cb92
commit b2a6662932
1 changed file with 6 additions and 2 deletions

@@ -110,6 +110,7 @@ struct connection {
 #define CF_IS_OTHERCON 5
 #define CF_CLOSE 6
 #define CF_APP_LIMITED 7
+#define CF_CLOSING 8
 	struct list_head writequeue;	/* List of outgoing writequeue_entries */
 	spinlock_t writequeue_lock;
 	int (*rx_action) (struct connection *);	/* What to do when active */
@@ -581,9 +582,11 @@ static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
 static void close_connection(struct connection *con, bool and_other,
 			     bool tx, bool rx)
 {
-	if (tx && cancel_work_sync(&con->swork))
+	bool closing = test_and_set_bit(CF_CLOSING, &con->flags);
+
+	if (tx && !closing && cancel_work_sync(&con->swork))
 		log_print("canceled swork for node %d", con->nodeid);
-	if (rx && cancel_work_sync(&con->rwork))
+	if (rx && !closing && cancel_work_sync(&con->rwork))
 		log_print("canceled rwork for node %d", con->nodeid);
 
 	mutex_lock(&con->sock_mutex);
@@ -603,6 +606,7 @@ static void close_connection(struct connection *con, bool and_other,
 
 	con->retries = 0;
 	mutex_unlock(&con->sock_mutex);
+	clear_bit(CF_CLOSING, &con->flags);
 }
 
 /* Data received from remote end */
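
As an illustration (not part of the commit), here is a minimal userspace
sketch of the guard the patch adds. C11 atomics stand in for the kernel's
test_and_set_bit()/clear_bit(), the synchronous work cancellation and the
socket teardown under con->sock_mutex are reduced to printf placeholders
and comments, and only the struct, flag, and function names mirror the
patch; everything else is a hypothetical analogy, not kernel code.

/* Userspace sketch of the CF_CLOSING guard; hypothetical stand-ins. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define CF_CLOSING 8

struct connection {
	atomic_ulong flags;	/* stand-in for the kernel's con->flags */
};

/* stand-in for the kernel's test_and_set_bit(): returns the old bit value */
static bool test_and_set_bit(int nr, atomic_ulong *flags)
{
	return atomic_fetch_or(flags, 1UL << nr) & (1UL << nr);
}

/* stand-in for the kernel's clear_bit() */
static void clear_bit(int nr, atomic_ulong *flags)
{
	atomic_fetch_and(flags, ~(1UL << nr));
}

static void close_connection(struct connection *con, bool tx, bool rx)
{
	/* Only the first closer sees closing == false and may block in the
	 * (placeholder) synchronous cancels; a concurrent closer skips them,
	 * so a send-side and a receive-side closer can no longer each wait
	 * for the other's work item to finish. */
	bool closing = test_and_set_bit(CF_CLOSING, &con->flags);

	if (tx && !closing)
		printf("cancel send work (cancel_work_sync placeholder)\n");
	if (rx && !closing)
		printf("cancel recv work (cancel_work_sync placeholder)\n");

	/* ... socket teardown under con->sock_mutex would happen here ... */

	clear_bit(CF_CLOSING, &con->flags);
}

int main(void)
{
	struct connection con = { .flags = 0 };

	close_connection(&con, true, true);	/* first closer: cancels run */
	return 0;
}

In the sketch, as in the patch, the bit acts as a "someone is already
closing" marker rather than a lock: the second closer does not wait for
the first, it simply skips the cancel_work_sync() calls that caused the
mutual wait.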