lists: Parallelize read_pkglist_heders_for_repo

Use the multiprocessing module to read the headers for each
repo in parallel. Usually, reading data for a repository means
loading 6 lists, so the gain is considerable.
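For context, a minimal runnable sketch of the pattern the diff below applies:
collect the pkglist paths first, then hand them to a multiprocessing.Pool and
flatten the per-file results. The read_headers helper and the sample paths are
illustrative stand-ins, not code from this repository.

    import multiprocessing

    def read_headers(path):
        # illustrative stand-in for read_pkglist_headers_rpm:
        # parse one pkglist file and return a list of header objects
        return [path]

    if __name__ == '__main__':
        paths = ['srclist.classic', 'pkglist.classic', 'pkglist.checkinstall']
        with multiprocessing.Pool() as pool:
            # map_async dispatches the files to worker processes and returns
            # immediately; .get() blocks until every file has been read
            result = pool.map_async(read_headers, paths)
            headers = sum(result.get(), [])  # flatten the list of per-file lists
        print(headers)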
Ivan A. Melnikov 2023-12-05 13:27:43 +04:00
parent bbf24803fa
commit 8388c0627b


@@ -1,6 +1,7 @@
 import collections
 import logging
+import multiprocessing
 import os
 import shutil
 import subprocess
@@ -84,9 +85,11 @@ def read_pkglist_headers_rpm(path):
 def read_pkglist_heders_for_repo(repo_path, arches, components=None):
-    bin_headers = []
-    src_headers = []
+    bin_lists = []
+    src_lists = []
     seen = set()
+    # collect the files
     for arch in arches:
         basedir = os.path.join(repo_path, arch, 'base')
         for pkglist in os.listdir(basedir):
@@ -103,9 +106,12 @@ def read_pkglist_heders_for_repo(repo_path, arches, components=None):
                 LOG.info('Ignoring %s/%s', basedir, pkglist)
                 continue
             seen.add(what)
-            (src_headers if parts[0] == 'srclist' else bin_headers).extend(
-                read_pkglist_headers_rpm(os.path.join(basedir, pkglist)))
-    return src_headers, bin_headers
+            (src_lists if parts[0] == 'srclist' else bin_lists).append(
+                os.path.join(basedir, pkglist))
+    with multiprocessing.Pool() as p:
+        src_res = p.map_async(read_pkglist_headers_rpm, src_lists)
+        bin_res = p.map_async(read_pkglist_headers_rpm, bin_lists)
+    return sum(src_res.get(), []), sum(bin_res.get(), [])
def _read_pkglist_rpm(path):