From ad63bcff6d549f4ecc91b1e11a5494aab7aa8113 Mon Sep 17 00:00:00 2001
From: Pawel Marczewski <pawel@invisiblethingslab.com>
Date: Thu, 6 Feb 2020 17:23:42 +0100
Subject: [PATCH] daemon: don't listen to clients while reconnecting to agent

Otherwise, connections from client will hang.

See QubesOS/qubes-issues#5347.
---
 daemon/qrexec-daemon.c        | 24 +++++++++++++++++-------
 qrexec/tests/socket/agent.py  | 10 ++--------
 qrexec/tests/socket/daemon.py | 25 +++++++++++++++++++++++--
 qrexec/tests/socket/qrexec.py | 10 +++++++++-
 qrexec/tests/socket/util.py   | 28 ++++++++++++++++++++++++++++
 rpm_spec/qubes-qrexec.spec.in |  1 +
 6 files changed, 80 insertions(+), 18 deletions(-)
 create mode 100644 qrexec/tests/socket/util.py

diff --git a/daemon/qrexec-daemon.c b/daemon/qrexec-daemon.c
index 6a55efb3..4b2060ef 100644
--- a/daemon/qrexec-daemon.c
+++ b/daemon/qrexec-daemon.c
@@ -158,12 +158,16 @@ int create_qrexec_socket(int domid, const char *domname)
     snprintf(link_to_socket_name, sizeof link_to_socket_name,
              "%s/qrexec.%s", socket_dir, domname);
     unlink(link_to_socket_name);
+
+    /* When running as root, make the socket accessible; perms on /var/run/qubes still apply */
+    umask(0);
     if (symlink(socket_address, link_to_socket_name)) {
         fprintf(stderr, "symlink(%s,%s) failed: %s\n", socket_address,
                 link_to_socket_name, strerror (errno));
     }
-    atexit(unlink_qrexec_socket);
-    return get_server_socket(socket_address);
+    int fd = get_server_socket(socket_address);
+    umask(0077);
+    return fd;
 }
 
 #define MAX_STARTUP_TIME_DEFAULT 60
@@ -345,11 +349,10 @@ void init(int xid)
         vchan_port_notify_client[i] = VCHAN_PORT_UNUSED;
     }
 
-    /* When running as root, make the socket accessible; perms on /var/run/qubes still apply */
-    umask(0);
+    atexit(unlink_qrexec_socket);
     qrexec_daemon_unix_socket_fd =
         create_qrexec_socket(xid, remote_domain_name);
-    umask(0077);
+
     signal(SIGPIPE, SIG_IGN);
     signal(SIGCHLD, sigchld_handler);
     signal(SIGUSR1, SIG_DFL);
@@ -909,9 +912,13 @@ static int fill_fdsets_for_select(int vchan_fd, fd_set * read_fdset, fd_set * wr
 /* qrexec-agent has disconnected, cleanup local state and try to connect again.
  * If remote domain dies, terminate qrexec-daemon.
  */
-static int handle_agent_restart(void) {
+static int handle_agent_restart(int xid) {
     size_t i;
 
+    // Stop listening.
+    unlink_qrexec_socket();
+    close(qrexec_daemon_unix_socket_fd);
+
     /* Close old (dead) vchan connection. */
     libvchan_close(vchan);
     vchan = NULL;
@@ -949,6 +956,9 @@ static int handle_agent_restart(void) {
         return -1;
     }
     fprintf(stderr, "qrexec-agent has reconnected\n");
+
+    qrexec_daemon_unix_socket_fd =
+        create_qrexec_socket(xid, remote_domain_name);
     return 0;
 }
 
@@ -1044,7 +1054,7 @@ int main(int argc, char **argv)
 
         if (!libvchan_is_open(vchan)) {
             fprintf(stderr, "qrexec-agent has disconnected\n");
-            if (handle_agent_restart() < 0) {
+            if (handle_agent_restart(remote_domain_id) < 0) {
                 fprintf(stderr, "Failed to reconnect to qrexec-agent, terminating\n");
                 return 1;
             }
diff --git a/qrexec/tests/socket/agent.py b/qrexec/tests/socket/agent.py
index 6fe507f4..81f97298 100644
--- a/qrexec/tests/socket/agent.py
+++ b/qrexec/tests/socket/agent.py
@@ -30,6 +30,7 @@
 
 
 from . import qrexec
+from . import util
 
 
 ROOT_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__),
@@ -78,13 +79,6 @@ def wait_for_agent_children(self):
         children = proc.children(recursive=True)
         psutil.wait_procs(children)
 
-    def wait_until(self, func, message, n_tries=10, delay=0.1):
-        for _ in range(n_tries):
-            if func():
-                return
-            time.sleep(delay)
-        self.fail('Timed out waiting: ' + message)
-
     def connect_dom0(self):
         dom0 = qrexec.vchan_client(self.tempdir, self.domain, 0, 512)
         self.addCleanup(dom0.close)
@@ -129,7 +123,7 @@ def test_just_exec(self):
             (qrexec.MSG_DATA_EXIT_CODE, b'\0\0\0\0'),
         ])
 
-        self.wait_until(
+        util.wait_until(
             lambda: os.path.exists(os.path.join(self.tempdir, 'new_file')),
             'file created')
 
diff --git a/qrexec/tests/socket/daemon.py b/qrexec/tests/socket/daemon.py
index 6f552f19..6eee48bb 100644
--- a/qrexec/tests/socket/daemon.py
+++ b/qrexec/tests/socket/daemon.py
@@ -24,11 +24,12 @@
 import tempfile
 import shutil
 import struct
-import psutil
 from typing import Tuple
 
-from . import qrexec
+import psutil
 
+from . import qrexec
+from . import util
 
 ROOT_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__),
                                          '..', '..', '..'))
@@ -161,6 +162,26 @@ def test_client_handshake(self):
         client = self.connect_client()
         client.handshake()
 
+    def test_restart_agent(self):
+        agent = self.start_daemon_with_agent()
+        agent.handshake()
+
+        agent.close()
+
+        util.wait_until(
+            lambda: not os.path.exists(
+                os.path.join(self.tempdir, 'qrexec.{}'.format(self.domain))),
+            'socket deleted')
+
+        agent = self.connect_agent()
+        agent.accept()
+        agent.handshake()
+
+        # Now, new client should be able to connect
+        client = self.connect_client()
+        client.handshake()
+
+
     def test_client_cmdline(self):
         agent = self.start_daemon_with_agent()
         agent.handshake()
diff --git a/qrexec/tests/socket/qrexec.py b/qrexec/tests/socket/qrexec.py
index fad38349..38eb9fa7 100644
--- a/qrexec/tests/socket/qrexec.py
+++ b/qrexec/tests/socket/qrexec.py
@@ -132,6 +132,10 @@ def vchan_server(socket_dir, domain, remote_domain, port):
 
 
 def socket_server(socket_path):
+    try:
+        os.unlink(socket_path)
+    except FileNotFoundError:
+        pass
     server = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
     server.bind(socket_path)
     server.listen(1)
@@ -151,4 +155,8 @@ def connect_when_ready(conn, path):
             return
 
     # Try for the last time (to propagate the exception)
-    conn.connect(path)
+    try:
+        conn.connect(path)
+    except IOError:
+        conn.close()
+        raise
diff --git a/qrexec/tests/socket/util.py b/qrexec/tests/socket/util.py
new file mode 100644
index 00000000..bfd26ca1
--- /dev/null
+++ b/qrexec/tests/socket/util.py
@@ -0,0 +1,28 @@
+# -*- encoding: utf-8 -*-
+#
+# The Qubes OS Project, http://www.qubes-os.org
+#
+# Copyright (C) 2020  Paweł Marczewski  <pawel@invisiblethingslab.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program; if not, see <http://www.gnu.org/licenses/>.
+
+import time
+
+
+def wait_until(func, message, n_tries=10, delay=0.1):
+    for _ in range(n_tries):
+        if func():
+            return
+        time.sleep(delay)
+    raise Exception('Timed out waiting: ' + message)
diff --git a/rpm_spec/qubes-qrexec.spec.in b/rpm_spec/qubes-qrexec.spec.in
index a501907b..2ef7e5b3 100644
--- a/rpm_spec/qubes-qrexec.spec.in
+++ b/rpm_spec/qubes-qrexec.spec.in
@@ -149,6 +149,7 @@ rm -f %{name}-%{version}
 %{python3_sitelib}/qrexec/tests/socket/agent.py
 %{python3_sitelib}/qrexec/tests/socket/daemon.py
 %{python3_sitelib}/qrexec/tests/socket/qrexec.py
+%{python3_sitelib}/qrexec/tests/socket/util.py
 
 
 %dir %{python3_sitelib}/qrexec/glade