lists: Parallelize read_pkglist_heders_for_repo

Use the multiprocessing module to read the headers for each
repo in parallel. Usually, reading data for a repository means
loading 6 lists, so the gain is considerable.
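For context, a minimal runnable sketch of the pattern the diff below applies:
collect the pkglist paths first, then hand them to a multiprocessing.Pool and
flatten the per-file results. The read_headers helper and the sample paths are
illustrative stand-ins, not code from this repository.

    import multiprocessing

    def read_headers(path):
        # illustrative stand-in for read_pkglist_headers_rpm:
        # parse one pkglist file and return a list of header objects
        return [path]

    if __name__ == '__main__':
        paths = ['srclist.classic', 'pkglist.classic', 'pkglist.checkinstall']
        with multiprocessing.Pool() as pool:
            # map_async dispatches the files to worker processes and returns
            # immediately; .get() blocks until every file has been read
            result = pool.map_async(read_headers, paths)
            headers = sum(result.get(), [])  # flatten the list of per-file lists
        print(headers)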
Ivan A. Melnikov 2023-12-05 13:27:43 +04:00
parent bbf24803fa
commit 8388c0627b


@@ -1,6 +1,7 @@
 import collections
 import logging
+import multiprocessing
 import os
 import shutil
 import subprocess
@@ -84,9 +85,11 @@ def read_pkglist_headers_rpm(path):
 def read_pkglist_heders_for_repo(repo_path, arches, components=None):
-    bin_headers = []
-    src_headers = []
+    bin_lists = []
+    src_lists = []
     seen = set()
+    # collect the files
     for arch in arches:
         basedir = os.path.join(repo_path, arch, 'base')
         for pkglist in os.listdir(basedir):
@@ -103,9 +106,12 @@ def read_pkglist_heders_for_repo(repo_path, arches, components=None):
                 LOG.info('Ignoring %s/%s', basedir, pkglist)
                 continue
             seen.add(what)
-            (src_headers if parts[0] == 'srclist' else bin_headers).extend(
-                read_pkglist_headers_rpm(os.path.join(basedir, pkglist)))
-    return src_headers, bin_headers
+            (src_lists if parts[0] == 'srclist' else bin_lists).append(
+                os.path.join(basedir, pkglist))
+    with multiprocessing.Pool() as p:
+        src_res = p.map_async(read_pkglist_headers_rpm, src_lists)
+        bin_res = p.map_async(read_pkglist_headers_rpm, bin_lists)
+    return sum(src_res.get(), []), sum(bin_res.get(), [])
def _read_pkglist_rpm(path):