
Commit a16095e

Increase potential max workers and allow connection pool to increase to match.
1 parent 80fb9ef commit a16095e

2 files changed: 39 additions & 13 deletions

kolibri/core/content/utils/resource_import.py

Lines changed: 23 additions & 0 deletions

@@ -282,6 +282,10 @@ def run_import(self):
         # Allow for two open file descriptors per download:
         # The temporary download file that the file is streamed to initially, and then
         # the actual destination file that it is moved to.
+        # Note that with the possibility of a chunked file download,
+        # the true number of file descriptors used may be higher,
+        # but this is unlikely to be a problem in practice, and we build in extra tolerance
+        # in the fd_safe_executor max worker calculation.
         with fd_safe_executor(fds_per_task=2) as executor:
             self.executor = executor
             batch_size = 100

@@ -392,8 +396,27 @@ def __init__(
         )
 
         self.session = requests.Session()
+        # Because we create the executor in the run method, we need to
+        # mount the adapter in the create_file_transfer method
+        # so that we can introspect the executor to configure the pool correctly.
+        self._adapter_mounted = False
+
+    def _mount_adapter(self):
+        if not self._adapter_mounted:
+            # If we are using a ThreadPoolExecutor, then we need to make sure
+            # that the requests session has enough connections to handle
+            # the number of threads.
+            max_workers = self.executor._max_workers
+            adapter = requests.adapters.HTTPAdapter(
+                pool_connections=max_workers,
+                pool_maxsize=max_workers,
+            )
+            self.session.mount("http://", adapter)
+            self.session.mount("https://", adapter)
+            self._adapter_mounted = True
 
     def create_file_transfer(self, f, filename, dest):
+        self._mount_adapter()
         url = paths.get_content_storage_remote_url(filename, baseurl=self.baseurl)
         return transfer.FileDownload(
             url,

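Why the pool needs to grow to match: requests' HTTPAdapter defaults to a connection pool of 10 per host (urllib3's DEFAULT_POOLSIZE), so with up to 50 worker threads sharing one Session against a single remote, any connection beyond the pool size would be discarded after use and reopened on the next request. Below is a minimal standalone sketch of the same pattern, assuming nothing from the Kolibri codebase; the make_session and fetch helpers and the URL list are illustrative only.

    import concurrent.futures

    import requests
    import requests.adapters


    def make_session(max_workers):
        # Mirror _mount_adapter above: size both the number of cached
        # host pools and the per-host pool to the worker count.
        session = requests.Session()
        adapter = requests.adapters.HTTPAdapter(
            pool_connections=max_workers,
            pool_maxsize=max_workers,
        )
        session.mount("http://", adapter)
        session.mount("https://", adapter)
        return session


    def fetch(session, url):
        # Worker threads share the session, checking connections out
        # of the pool instead of opening a fresh one per request.
        response = session.get(url, timeout=10)
        response.raise_for_status()
        return len(response.content)


    with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
        session = make_session(executor._max_workers)
        urls = ["https://example.com/"] * 10  # stand-in download URLs
        sizes = list(executor.map(lambda url: fetch(session, url), urls))

Mounting lazily from create_file_transfer, rather than in __init__, sidesteps the ordering problem the new comment describes: the executor does not exist until the run method creates it, so its _max_workers count cannot be read any earlier.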
kolibri/core/tasks/utils.py

Lines changed: 16 additions & 13 deletions

@@ -358,12 +358,16 @@ def fd_safe_executor(fds_per_task=2):
         else concurrent.futures.ThreadPoolExecutor
     )
 
-    max_workers = 10
+    max_workers = 50
 
-    if not use_multiprocessing:
-        # If we're not using multiprocessing for workers, we may need
-        # to limit the number of workers depending on the number of allowed
-        # file descriptors.
+    # We may need to limit the number of workers depending
+    # on the number of allowed file descriptors.
+
+    if conf.OPTIONS["Tasks"]["USE_WORKER_MULTIPROCESSING"]:
+        # If we are using multiprocessing, then file descriptors are not shared.
+        # So we can use all the available file descriptors for this task.
+        max_descriptors_per_task = get_fd_limit()
+    else:
         # This is a heuristic method, where we know there can be issues if
         # the max number of file descriptors for a process is 256, and we use 10
         # workers, with potentially 4 concurrent tasks downloading files.

@@ -376,12 +380,11 @@ def fd_safe_executor(fds_per_task=2):
         max_descriptors_per_task = (
             get_fd_limit() - server_reserved_fd_count
         ) / conf.OPTIONS["Tasks"]["REGULAR_PRIORITY_WORKERS"]
-        # Each task only needs to have a maximum of `fds_per_task` open file descriptors at once.
-        # To add tolerance, we divide the number of file descriptors that could be allocated to
-        # this task by double this number which should give us leeway in case of unforeseen
-        # descriptor use during the process.
-        max_workers = min(
-            max_workers, max(1, max_descriptors_per_task // (fds_per_task * 2))
-        )
-
+    # Each task only needs to have a maximum of `fds_per_task` open file descriptors at once.
+    # To add tolerance, we divide the number of file descriptors that could be allocated to
+    # this task by double this number which should give us leeway in case of unforeseen
+    # descriptor use during the process.
+    max_workers = min(
+        max_workers, max(1, max_descriptors_per_task // (fds_per_task * 2))
+    )
     return executor(max_workers=max_workers)

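To make the heuristic concrete, here is the same arithmetic run with assumed inputs; the descriptor limit of 256 comes from the macOS example in the comments above, while the reserved-descriptor count and the REGULAR_PRIORITY_WORKERS value are placeholders, since their real values are defined outside this diff.

    # Worked example of the fd_safe_executor cap under assumed values.
    fd_limit = 256  # stand-in for get_fd_limit(); macOS-style soft limit
    server_reserved_fd_count = 64  # assumed; defined outside this diff
    regular_priority_workers = 4  # assumed value of the Tasks option
    fds_per_task = 2  # temporary download file + destination file

    # Thread pool branch: descriptors are shared across this process,
    # so divide what is left after the server's reservation among the
    # regular-priority workers.
    max_descriptors_per_task = (
        fd_limit - server_reserved_fd_count
    ) / regular_priority_workers  # (256 - 64) / 4 = 48.0

    # Divide by double fds_per_task for tolerance, keep at least one
    # worker, and never exceed the new ceiling of 50.
    max_workers = min(50, max(1, int(max_descriptors_per_task // (fds_per_task * 2))))
    print(max_workers)  # 12 -- here the descriptor limit binds, not the ceiling

Under the multiprocessing branch the whole limit is available to the task: max_descriptors_per_task would be 256, 256 // 4 gives 64 candidate workers, and the ceiling of 50 binds instead.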