From 2a2544d3be0258ea55a9263e265cd770aa33662b Mon Sep 17 00:00:00 2001
From: Alexander Lakhin
Date: Thu, 20 Feb 2014 09:32:26 +0400
Subject: [PATCH] Merge fill-repodb and prepare-repodb

---
 README                     |    8 +-
 analyze-repo-redundancy.py |    2 +-
 analyze-repodb.py          |    6 +-
 fill-repodb.py             |  788 ----------------------------
 prepare-repodb.py          | 1013 ++++++++++++++++++++++++++++----
 5 files changed, 909 insertions(+), 908 deletions(-)
 delete mode 100755 fill-repodb.py

diff --git a/README b/README
index 41d56f8..22dd25a 100644
--- a/README
+++ b/README
@@ -1,14 +1,12 @@
 How to use the scripts:
 1. Set up the repository structure and paths in repo-analyze-config.xml
-2. Fill the database with data from the repositories:
-fill-repodb.py repo-analyze-config.xml
+2. Fill the database with data from the repositories and prepare it for analysis:
+prepare-repodb.py repo-analyze-config.xml
 * To speed things up, processing of .so files and their symbols can be disabled with the -O and -S options, respectively.
 ** When using remote repositories, applying a cache is recommended: -c cache-dir
 *** The script creates a repo.db database about 2 GB in size in the current directory (with full processing, for the Chrome repositories).
-3. Prepare the database for analysis:
-prepare-repodb.py
-4. Run the analysis/checks:
+3. Run the analysis/checks:
 analyze-repodb.py
 analyze-repo-redundancy.py i586kde.lst --repo rosa-dx-chrome-1.0/i586/main/release >i586-redundant.txt
diff --git a/analyze-repo-redundancy.py b/analyze-repo-redundancy.py
index f7cd960..f10d7cb 100755
--- a/analyze-repo-redundancy.py
+++ b/analyze-repo-redundancy.py
@@ -42,7 +42,7 @@ def main(args):
     for repo in options.repo[0]:
         print repo
         rid = c.execute("""
-SELECT id FROM repodirs WHERE name = ? OR path = ?
+SELECT id FROM repodirs WHERE name = ? OR url = ?
 """, [repo, repo]).fetchall()
         if not rid:
             print 'Repository "%s" not found.' % repo
diff --git a/analyze-repodb.py b/analyze-repodb.py
index be1d920..3ba9dd4 100755
--- a/analyze-repodb.py
+++ b/analyze-repodb.py
@@ -31,7 +31,7 @@ def detect_broken_dependencies(dbc, dot_output):
         else:
             deps = all_broken[pkg_id]['deps']
             if deps is not None:
-                for dep_id in deps:
+                for dep_id in sorted(deps.keys()):
                     if deps[dep_id]['build_arch'] == build_arch:
                         chains = build_dep_chains(dep_id, current_repodir,
                                                   all_broken, build_arch,
@@ -256,7 +256,7 @@ def analyze_partitioning(dbc):
     print '==='
     print 'Possible partitioning:'
     repodirs = dbc.execute("""
-SELECT id, name, sources, path FROM repodirs WHERE sources <> '.' ORDER BY id
+SELECT id, name, sources FROM repodirs WHERE sources <> '.'
ORDER BY id """).fetchall() for repodir in repodirs: (rd_id, rd_name) = (repodir[0], repodir[1]) @@ -348,7 +348,7 @@ class query_output: def __init__(self, dbc): self.dbc = dbc self.repodirs = dbc.execute(""" -SELECT id, name, sources, path FROM repodirs ORDER BY id +SELECT id, name, sources FROM repodirs ORDER BY id """).fetchall() for repodir in self.repodirs: (rd_id, rd_name) = (repodir[0], repodir[1]) diff --git a/fill-repodb.py b/fill-repodb.py deleted file mode 100755 index 06f8935..0000000 --- a/fill-repodb.py +++ /dev/null @@ -1,788 +0,0 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- - -import os -import sys -import gettext -import argparse -import sqlite3 -import rpm -import re -import xml.etree.ElementTree as ET -import subprocess -import shutil -import time -import multiprocessing as mp -import gc -import urllib -from urlparse import urlparse, urlunparse - -gettext.install('urpm-tools') - -DB = 'repo.db' - -NUM_PROCESSES = 1 # number of CPU's (evaluated automatically) - -RPMFILEMODE_DIRECTORY = 0x4000 -RPMFILEMODE_EXECUTE = 0111 - -def get_files(url, ext): - filelist = [] - urlp = urlparse(url) - if urlp.scheme in ['ftp', 'http', 'https']: - return parse_index_html(wget_url(url, None), url, '.rpm') - dir_list = os.listdir(url) - for d in dir_list: - if d.endswith(ext): - filepath = os.path.normpath(os.path.join(url, d)) - filelist.append(filepath) - return filelist - -def parseargs(): - parser = argparse.ArgumentParser(description=_('extract packages metadata' - ' from RPM repositories')) - parser.add_argument('config', metavar='config', - help=_('path to repo-analyze-config.xml')) - parser.add_argument('-c', '--cache-dir', - help=_('path to cache directory')) - parser.add_argument('-O', '--no-shared-objects', action='store_true', - help=_('don\'t process shared objects')) - parser.add_argument('-S', '--no-so-symbols', action='store_true', - help=_('don\'t process shared object symbols')) - opts = parser.parse_args() - return opts - -def to_string(rpm, tag, val): - if type(val) == type([]): - if not(val): - return None - try: - return str(val).decode('utf-8') - except: - print >> sys.stderr, 'Invalid UTF-8 string!\n(%s:\n%s = "%s")\n' % \ - (rpm, tag, val) - return str(val).decode('utf-8', 'replace') - -def init_database(conn): - conn.executescript(""" -CREATE TABLE repodirs(id INTEGER PRIMARY KEY NOT NULL, - name TEXT UNIQUE, path TEXT, arch TEXT, sources TEXT); -CREATE TABLE repodir_depends(id INTEGER PRIMARY KEY NOT NULL, - repodir_id INTEGER, depend_repodir_name TEXT); -CREATE TABLE IF NOT EXISTS package_files(id INTEGER PRIMARY KEY NOT NULL, - package_id INTEGER NOT NULL, basename TEXT, path TEXT, - size INTEGER, mode INTEGER, - link_to_file_id INTEGER, link_to_path TEXT, mark TEXT); -CREATE TABLE package_requires_res(id INTEGER PRIMARY KEY NOT NULL, - package_id INTEGER, requires_id INTEGER, - provides_id INTEGER, dep_package_id INTEGER); -CREATE TABLE package_conflicts_res(id INTEGER PRIMARY KEY NOT NULL, - package_id INTEGER, conflicts_id INTEGER, - provides_id INTEGER, dep_package_id INTEGER); -CREATE TABLE package_obsoletes_res(id INTEGER PRIMARY KEY NOT NULL, - package_id INTEGER, obsoletes_id INTEGER, - provides_id INTEGER, dep_package_id INTEGER); -CREATE TABLE so_needed(id INTEGER PRIMARY KEY NOT NULL, - obj_file_id INTEGER, name TEXT); -CREATE TABLE so_needed_res(id INTEGER PRIMARY KEY NOT NULL, - so_needed_id INTEGER, dep_obj_file_id INTEGER, res_type INTEGER); -CREATE TABLE obj_symbols(id INTEGER PRIMARY KEY NOT NULL, - obj_file_id INTEGER, name TEXT, sym_type 
INTEGER); -CREATE TABLE obj_symbols_res(id INTEGER PRIMARY KEY NOT NULL, - obj_sym_id INTEGER, dep_obj_sym_id INTEGER, res_type INTEGER); -PRAGMA synchronous = OFF; -PRAGMA journal_mode = OFF; -""") - -def index_database(conn): - print 'Indexing the database...' - conn.executescript(""" -CREATE INDEX rd_name ON repodirs(name); -CREATE INDEX pkg_name ON packages(name); -CREATE INDEX pkg_nvra ON packages(nvra); -CREATE INDEX pkg_arch ON packages(arch); -CREATE INDEX pkg_group ON packages(rpm_group); -CREATE INDEX pkg_repodir ON packages(repodir_id); -CREATE INDEX pkg_rq_pkg_req ON package_requires_res(package_id, requires_id); -CREATE INDEX pkg_rq_pkg_prov ON package_requires_res(dep_package_id, provides_id); -CREATE INDEX pkg_cf_pkg_conf ON package_conflicts_res(package_id, conflicts_id); -CREATE INDEX pkg_cf_pkg_prov ON package_conflicts_res(dep_package_id, provides_id); -CREATE INDEX pkg_ob_pkg_obs ON package_obsoletes_res(package_id, obsoletes_id); -CREATE INDEX pkg_ob_pkg_prov ON package_obsoletes_res(dep_package_id, provides_id); -CREATE INDEX pkg_file_pkg_id ON package_files(package_id); -CREATE INDEX pkg_file_name ON package_files(basename); -CREATE INDEX pkg_file_path ON package_files(path); -CREATE INDEX pkg_file_mark ON package_files(mark); -CREATE INDEX so_needed_obj_id ON so_needed(obj_file_id); -CREATE INDEX so_needed_res_sn ON so_needed_res(so_needed_id); -CREATE INDEX symbols_obj_name_type ON obj_symbols(obj_file_id, name, sym_type); -CREATE INDEX symbols_name_type ON obj_symbols(name, sym_type); -CREATE INDEX symbols_res_sym ON obj_symbols_res(obj_sym_id); -""") - dep_tables = ['rpm_requires', 'rpm_provides', - 'rpm_conflicts', 'rpm_obsoletes'] - for table in dep_tables: - conn.execute('CREATE INDEX %(tbl)s_pkg ON %(tbl)s(package_id)' % - {'tbl': table}) - conn.execute('CREATE INDEX %(tbl)s_name ON %(tbl)s(name)' % - {'tbl': table}) - conn.commit() - -def add_repodir(xrepodir, conn): - dbc = conn.cursor() - dbc.execute(""" -INSERT INTO repodirs (name, path, sources) VALUES (?, ?, ?) -""", [xrepodir.get('name'), xrepodir.get('path'), xrepodir.get('sources')]) - repodir_id = dbc.lastrowid - for depend in xrepodir.findall('dependency'): - dbc.execute(""" -INSERT INTO repodir_depends(repodir_id, depend_repodir_name) VALUES (?, ?) -""", [repodir_id, depend.text.strip()]) - conn.commit() - return repodir_id - -def get_build_archs(xrepodir, xrepodirs): - build_archs = [] - for depend in xrepodir.findall('dependency'): - arch_sign = '$arch' - depend_repo = depend.text.strip() - spos = depend_repo.find(arch_sign) - if spos >= 0: - drepo_prefix = depend_repo[:spos] - drepo_postfix = depend_repo[spos + len(arch_sign):] - for xrepodir in xrepodirs.findall('dir'): - repo_name = xrepodir.get('name') - if repo_name.startswith(drepo_prefix) and \ - repo_name.endswith(drepo_postfix): - repo_arch = repo_name[len(drepo_prefix) : - len(repo_name) - len(drepo_postfix)] - if repo_arch == 'SRPMS': - continue - if repo_arch not in build_archs: - build_archs.append(repo_arch) - if build_archs: - return build_archs - return [None] - -def get_rpm_header(rpm_ts, pkg): - hdr = None - try: - fdno = os.open(pkg, os.O_RDONLY) - except OSError as exc: - raise Exception('Unable to open file %s.\n%s' % (pkg, exc)) - try: - hdr = rpm_ts.hdrFromFdno(fdno) - except rpm.error as exc: - raise Exception('Unable to read RPM header for %s\n%s.' 
% (pkg, exc)) - finally: - os.close(fdno) - return hdr - -def generate_new_id(generator, gen_lock): - gen_lock.acquire() - last_id = generator.value - last_id += 1 - generator.value = last_id - gen_lock.release() - return last_id - -FILE_REC_ID_IDX = 0 -FILE_REC_PATH_IDX = 3 -FILE_REC_LINK_IDX = 6 -FILE_REC_MARK_IDX = 7 - -def register_object(data, object_file_record, temp_dir, no_so_symbols): - so_needed = data['so_needed'] - obj_symbols = data['obj_symbols'] - obj_id = object_file_record[0] - obj_file_path = object_file_record[3] - temp_obj_file = os.path.join(temp_dir, obj_file_path.lstrip('/')) - - target_file = None - file_mark = None - od_out = '' - nmundef_out = '' - nmdef_out = '' - if os.path.islink(temp_obj_file): - target_file = os.path.join(os.path.dirname(obj_file_path), - os.readlink(temp_obj_file)) - file_mark = 'link' - elif not os.path.exists(temp_obj_file): - file_mark = 'not-found' - else: - p = subprocess.Popen(['objdump', '-p', temp_obj_file], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - od_out = p.communicate()[0] - if p.returncode != 0: - file_mark = 'invalid-format' - elif not(no_so_symbols): - p = subprocess.Popen(['nm', '-p', '-D', '--undefined-only', - temp_obj_file], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - nmundef_out = p.communicate()[0] - if p.returncode != 0: - file_mark = 'no-symbols' - else: - p = subprocess.Popen(['nm', '-p', '-D', '--defined-only', - temp_obj_file], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - nmdef_out = p.communicate()[0] - if p.returncode != 0: - file_mark = 'no-symbols' - else: - file_mark = 'so' - - object_file_record[FILE_REC_LINK_IDX] = target_file - object_file_record[FILE_REC_MARK_IDX] = file_mark - - dynsection = False - for odline in od_out.split('\n'): - odls = odline.strip() - if odls == '': - dynsection = False - elif odls == 'Динамический раздел:' or odls == 'Dynamic section:': - dynsection = True - elif dynsection: - needrem = re.match(r'\s+NEEDED\s+(.*)', odline) - if needrem: - so_needed.append([obj_id, needrem.group(1)]) - - for symline in nmundef_out.split('\n'): - smre = re.match(r'^.([\S]*)\s+(\w)\s(.*)$', symline) - if smre: - if smre.group(2) in ['v', 'w']: - continue - symname = smre.group(3) - obj_symbols.append([obj_id, symname, 0]) - - for symline in nmdef_out.split('\n'): - smre = re.match(r'^.([\S]*)\s+(\w)\s(.*)$', symline) - if smre: - symname = smre.group(3) - obj_symbols.append([obj_id, symname, 1]) - - return obj_id - -def extract_files(pkg, files_list, obj_so_files_idx, temp_dir): - #local_pkg = getLocalPackageName(pkg) - local_pkg = pkg - filelist = os.path.join(temp_dir, 'files.lst') - with open(filelist, 'w') as f: - for i in obj_so_files_idx: - f.write('.' + files_list[i][FILE_REC_PATH_IDX] + '\n') - - rpm_cpio_cmd = 'rpm2cpio ' + local_pkg + ' | cpio -ivdu -E ' + filelist - p = subprocess.Popen(rpm_cpio_cmd, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - cwd=temp_dir, - shell=True) - output = p.communicate()[0] - if p.returncode != 0: - print >> sys.stderr, 'Couldn\'t extract files from package %s.' 
\ - '\n\t%s' % (pkg, output) - return False - return True - -def process_package_worker(num, queue_in, generator, gen_lock, db_struct, - repodir_id, build_archs, temp_dir, - no_shared_objects, no_so_symbols): - - rpm_ts = rpm.TransactionSet() - rpm_ts.setVSFlags(~(rpm.RPMVSF_NEEDPAYLOAD)) - data = {} - data['packages'] = [] - for table in db_struct['dep_tables']: - data[table] = [] - data['package_files'] = [] - data['so_needed'] = [] - data['obj_symbols'] = [] - - while True: - job = queue_in.get() - if job is None: - break - (pkg, ) = job - - pkg_id = generate_new_id(generator, gen_lock) - local_pkg = get_local_file(pkg, temp_dir) - - hdr = get_rpm_header(rpm_ts, local_pkg) - package_values = [] - package_values.append(pkg_id) - - for tag in db_struct['packages_tags']: - hval = hdr[tag] - package_values.append( - (sqlite3.Binary(hval) if len(hval)>0 else None) - if tag in db_struct['blob_tags'] else \ - to_string(pkg, tag, hval) if \ - type(hval) in [type([]), type('')] else hval - ) - package_values.append(repodir_id) - package_values.append(pkg) - package_values.append(None) - data['packages'].append(package_values) - for table in db_struct['dep_tables']: - table_data = data[table] - rpref = 'RPMTAG_' + table[4 : -1].upper() # rpm_requires - (dep_name, dep_flags, dep_version) = \ - (hdr[rpref + 'NAME'], hdr[rpref + 'FLAGS'], hdr[rpref + 'VERSION']) - for i in xrange(0, len(hdr[rpref + 'NAME'])): - for build_arch in build_archs: - table_data.append([dep_name[i].decode('utf-8'), - dep_flags[i], - dep_version[i], - pkg_id, build_arch]) - (pkg_file_paths, pkg_file_names, pkg_file_sizes, pkg_file_modes) = \ - (hdr['RPMTAG_FILEPATHS'], hdr['RPMTAG_BASENAMES'], - hdr['RPMTAG_FILESIZES'], hdr['RPMTAG_FILEMODES']) - files_list = data['package_files'] - files_dirs = {} - obj_so_files_idx = [] - for i in xrange(0, len(pkg_file_paths)): - file_name = pkg_file_names[i] - file_path = pkg_file_paths[i] - pkg_file_id = generate_new_id(generator, gen_lock) - files_list.append([pkg_file_id, #FILE_REC_ID_IDX = 0 - pkg_id, - file_name.decode('utf-8'), - file_path.decode('utf-8'), #FILE_REC_PATH_IDX = 3 - pkg_file_sizes[i], - pkg_file_modes[i], - None, #link_to_path FILE_REC_LINK_IDX = 6 - None #mark FILE_REC_LINK_IDX = 7 - ]) - if pkg_file_modes[i] & RPMFILEMODE_DIRECTORY != 0: - files_dirs[file_path] = False - continue - dir_name = os.path.dirname(file_path) - if dir_name != '' and dir_name not in files_dirs: - files_dirs[dir_name] = True - if no_shared_objects: - continue - if os.path.splitext(file_name)[1] in \ - ['.debug', '.xz', '.conf', '.py', '.c', '.h', '.hpp', '.png', - '.cc', '.cpp', '.sh', '.java', '.pl', '.patch', '.desktop']: - continue - if file_path.startswith('/usr/lib/debug/.build-id') or \ - file_path.endswith('/ld.so.cache'): - continue - if re.search(r'\.so($|\.)', file_name) or \ - (pkg_file_modes[i] & RPMFILEMODE_EXECUTE) != 0: - obj_so_files_idx.append(len(files_list) - 1) - - for fdir in sorted(files_dirs.keys()): - if files_dirs[fdir]: - # Add parent directories as implicit files - # TODO: recursive processing? 
- pkg_file_id = generate_new_id(generator, gen_lock) - files_list.append([pkg_file_id, #FILE_REC_ID_IDX = 0 - pkg_id, - os.path.basename(fdir), - fdir, #FILE_REC_PATH_IDX = 3 - 0, - -1, # special mode - None, #link_to_path FILE_REC_LINK_IDX = 6 - None #mark FILE_REC_LINK_IDX = 7 - ]) - - if obj_so_files_idx: - pkg_temp_dir = os.path.join(temp_dir, os.path.basename(local_pkg)) - os.makedirs(pkg_temp_dir) - if extract_files(local_pkg, files_list, - obj_so_files_idx, pkg_temp_dir): - for i in obj_so_files_idx: - register_object(data, files_list[i], pkg_temp_dir, - no_so_symbols) - - shutil.rmtree(pkg_temp_dir, True) - - remove_cached_file(pkg) - queue_in.task_done() - - conn = sqlite3.connect(DB, timeout=30) - conn.executemany(""" -INSERT INTO packages (%s) VALUES (%s)""" % - (db_struct['packages_field_names'], - db_struct['packages_values_template']), - data['packages']) - - for table in db_struct['dep_tables']: - conn.executemany(""" -INSERT INTO %s (name, flags, version, package_id, build_arch) -VALUES (?, ?, ?, ?, ?)""" % table, data[table]) - - conn.executemany(""" -INSERT INTO package_files (id, package_id, basename, path, size, mode, link_to_path, mark) -VALUES (?, ?, ?, ?, ?, ?, ?, ?)""", data['package_files']) - - conn.executemany(""" -INSERT INTO so_needed(obj_file_id, name) VALUES(?, ?) -""", data['so_needed']) - - conn.executemany(""" -INSERT INTO obj_symbols(obj_file_id, name, sym_type) VALUES(?, ?, ?) -""", data['obj_symbols']) - - conn.commit() - queue_in.task_done() - -local_cache = {} -def get_local_file(url, temp_dir): - urlp = urlparse(url) - if urlp.scheme in ['ftp', 'http', 'https']: - cached_file_name = local_cache.get(url) - if cached_file_name and os.path.isfile(cached_file_name): - return cached_file_name - cache_dir = os.path.join(temp_dir, 'cache') - if not os.path.isdir(cache_dir): - os.makedirs(cache_dir) - temp_file = os.path.join(cache_dir, os.path.basename(url)) - wget_url(url, temp_file) - local_cache[url] = temp_file - return temp_file - return url - -def remove_cached_file(url): - cached_file_name = local_cache.get(url) - if cached_file_name: - os.unlink(cached_file_name) - del local_cache[url] - -def wget_url(url, target_file): - urlp = urlparse(url) - wget_params = [] - site = urlp.netloc - if urlp.username: - wget_params = wget_params + ['--auth-no-challenge', - '--http-user=%s' % urlp.username, - '--http-password=%s' % - ('""' if not urlp.password else urlp.password)] - site = site[site.find('@') + 1:] - url = urlunparse((urlp.scheme, site, urlp.path, urlp.params, - urlp.query, urlp.fragment)) - print 'Downloading %s...' % url - if target_file is None: - wget_params += ['-nv', '-O-', url] - else: - wget_params += ['-nv', '-O', target_file, url] - p = subprocess.Popen(['wget'] + wget_params, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - wget_out, wget_err = p.communicate() - if p.returncode != 0: - print >> sys.stderr, ('Unable to get data from the url: %s ' - '(error: %d).\n%s\n%s') % \ - (url, p.returncode, wget_out, wget_err) - raise Exception('Unable to download data (%d).' 
% p.returncode) - if target_file is None: - return wget_out - -def parse_index_html(index_html, base_url, filter_ext): - file_list = [] - for match in re.finditer(r'href="([^"]+)"', index_html, re.M): - filename = match.group(1) - if filename.endswith(filter_ext): - if '://' in filename[:8]: - file_list.append(filename) - continue - filepath = os.path.join(base_url, filename) - if os.path.dirname(filepath) == base_url.rstrip('/') and \ - os.path.basename(filepath) == filename: - file_list.append(filepath) - return file_list - -def download_repodir(source_urlp, cache_dir): - site = source_urlp.netloc - site = site[site.find('@') + 1:] - target_dir = os.path.join(cache_dir, - site, - source_urlp.path.lstrip('/')) - if not os.path.isdir(target_dir): - os.makedirs(target_dir) - remote_files = {} - if source_urlp.scheme in ['ftp', 'http', 'https']: - source_url = source_urlp.geturl() - remote_dir_contents = parse_index_html(wget_url(source_url, None), - source_url, '.rpm') - for remote_file in remote_dir_contents: - remote_filename = urllib.unquote(os.path.basename(remote_file)) - remote_files[remote_filename] = True - target_file = os.path.join(target_dir, remote_filename) - if os.path.isfile(target_file): - continue - wget_url(remote_file, target_file) - - for local_filename in os.listdir(target_dir): - if local_filename not in remote_files and \ - local_filename.endswith('.rpm'): - print 'Removing local file: %s.' % local_filename - os.unlink(os.path.join(target_dir, local_filename)) - - return target_dir - -def urpm_get_packages(media): - extra_params = [] - if not media.endswith(' update'): - extra_params = ['--exclude-media', media + ' update'] - p = subprocess.Popen(['urpmq', '-r', '--ignorearch', - '--list', '--media', media] + - extra_params, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - urpmqr_out, urpmqr_err = p.communicate() - if p.returncode != 0 or len(urpmqr_err) > 0: - print >> sys.stderr, ('Unable to get a list of packages ' - 'from the media: %s.\n' - '%s\n%s') % (media, urpmqr_out, urpmq_err) - raise Exception('Unable to get a list of packages (%d).' % p.returncode) -# urpmi --no-install --allow-nodeps --force -# --download-all=/tmp/ xine-wavpack-1.2.4-1plf --media Desktop2012.1-8 - p = subprocess.Popen(['urpmq', '-f', '--ignorearch', - '--list', '--media', media] + - extra_params, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - urpmqf_out, urpmqf_err = p.communicate() - if p.returncode != 0 or len(urpmqf_err) > 0: - print >> sys.stderr, ('Unable to get a list of packages ' - 'from the media: %s.\n' - '%s\n%s') % (media, urpmqf_out, urpmqf_err) - raise Exception('Unable to get a list of packages (%d).' % p.returncode) - - rpm_list = [] - qr_lines = urpmqr_out.split('\n') - qf_lines = urpmqf_out.split('\n') - if len(qr_lines) != len(qf_lines): - raise Exception('Not consistent urpmq -r and urpmq -f outputs ' - '(%d and %d lines).' 
% (len(qr_lines), len(qf_lines))) - for i in xrange(0, len(qf_lines)): - qf_line = qf_lines[i] - if qf_line.strip() == '': - continue - if not qf_line.startswith(qr_lines[i]): - raise Exception('Not consistent urpmq -r and urpmq -f outputs: ' - '%s and %s' % (qr_lines[i], qf_line)) - rpm_list.append('urpm://%s/%s.rpm#%s' % (urllib.quote(media), - urllib.quote(qf_line), - urllib.quote(qr_lines[i]))) - return rpm_list - - -def get_urpmi(urpm_package, target_dir): - urlp = urlparse(urpm_package) - package_name = urllib.unquote(urlp.fragment) - print package_name - p = subprocess.Popen(['urpmi', '--no-install', - '--force', '--no-suggests', - '--allow-nodeps', - '--no-download-all', - '--media', urlp.netloc, - package_name]) - #stdout=subprocess.PIPE, - #stderr=subprocess.PIPE) - urpmi_out, urpmi_err = p.communicate() - if p.returncode != 0: - print >> sys.stderr, ('Unable to get the package %s ' - 'from the media %s.\n' - '%s\n%s') % ( - package_name, urlp.netloc, - urpmi_out, urpmi_err) - raise Exception('Unable to get the package %s (%d).' % - (package_name, p.returncode)) - - -def urpm_get_repodir(repodir_name, cache_dir): - target_dir = os.path.join(cache_dir, - repodir_name, - 'rpms') - if not os.path.isdir(target_dir): - os.makedirs(target_dir) - urpm_files = {} - urpm_media_contents = urpm_get_packages(repodir_name) - for urpm_package in urpm_media_contents: - remote_filename = urllib.unquote(os.path.basename(urpm_package)) - target_file = os.path.join(target_dir, remote_filename) - get_urpmi(urpm_package, os.path.join(cache_dir, - repodir_name)) - print target_file - raise Exception('Not implemented.') - -generator_value = 0 - -def process_repodir(xrepodir, repodir_id, cache_dir, build_archs, conn, - db_struct, temp_dir, no_shared_objects, no_so_symbols): - - repodir_url = xrepodir.get('url') - urlp = urlparse(repodir_url) - working_url = repodir_url - if cache_dir is not None: - if urlp.scheme in ['ftp', 'http', 'https']: - working_url = download_repodir(urlp, cache_dir) - elif urlp.scheme == 'urpm': - working_url = urpm_get_repodir(xrepodir.get('name'), cache_dir) - elif urlp.scheme not in ['', 'file']: - raise Exception('Invalid scheme in the repository url: %s' % - repodir_url) - rpm_list = [] - rpm_list = get_files(working_url, '.rpm') - if not rpm_list: - return - print urlp.netloc[urlp.netloc.find('@') + 1:] + urlp.path, ': ', \ - len(rpm_list) - if not db_struct.get('defined'): - rpm_ts = rpm.TransactionSet() - rpm_ts.setVSFlags(~(rpm.RPMVSF_NEEDPAYLOAD)) - # ts.setVSFlags(~(rpm.RPMVSF_NOMD5|rpm.RPMVSF_NEEDPAYLOAD)) - hdr = get_rpm_header(rpm_ts, get_local_file(rpm_list[0], temp_dir)) - - # Retain sort order! 
- packages_extra_fields = {'repodir_id': 'INTEGER', - 'rpm_url': 'TEXT', - 'sourcerpm_package': 'TEXT'} - - file_tags_re = r'^RPMTAG_(BASENAMES|FILE[\w\d]+)' - dir_tags_re = r'^RPMTAG_DIR(INDEXES|NAMES)' - changelog_tags_re = r'^RPMTAG_CHANGELOG\w+' - trigger_tags_re = r'^RPMTAG_TRIGGER\w+' - - datetime_tags = ['RPMTAG_PACKAGETIME', 'RPMTAG_RPMLIBTIMESTAMP', ] - db_struct['blob_tags'] = ['RPMTAG_RSAHEADER', 'RPMTAG_DSAHEADER', - 'RPMTAG_HEADERIMMUTABLE', 'RPMTAG_SIGMD5', - 'RPMTAG_PKGID', 'RPMTAG_SOURCEPKGID'] - - reserved_field_names = ['id', 'group'] - skip_tags_re = '^RPMTAG_(C|D|E|N|P|R|V|HEADERIMMUTABLE)$' - #C - CONFLICTNAME, D - DISTEPOCH, E - EPOCH, N - NAME, O - OBSOLETENAME - #P - PROVIDENAME, R - RELEASE, V - VERSION - - types = {"" : "TEXT", "": "INTEGER", - "": "TEXT", "": "TEXT"} - - dep_tags_re = r'^RPMTAG_(CONFLICT|OBSOLETE|PROVIDE|REQUIRE)\w+' - - db_struct['dep_tables'] = ['rpm_requires', 'rpm_provides', - 'rpm_conflicts', 'rpm_obsoletes'] - - packages_field_names = 'id, ' - packages_values_template = '?,' - packages_tags = [] - packages_fields = '' - - rpmtags = [str(t) for t in dir(rpm) if t.startswith('RPMTAG_') ] - for tag in rpmtags: - if (re.match(file_tags_re, tag) or re.match(dir_tags_re, tag) or - re.match(changelog_tags_re, tag) or - re.match(skip_tags_re, tag) or - re.match(trigger_tags_re, tag) or - re.match(dep_tags_re, tag)): - continue - sqltype = "TIMESTAMP" if tag in datetime_tags else \ - "BLOB" if tag in db_struct['blob_tags'] else \ - types[str(type(hdr[tag]))] - fieldname = tag.replace('RPMTAG_', '').lower() - if fieldname in reserved_field_names: - fieldname = 'rpm_' + fieldname - packages_tags.append(tag) - packages_field_names += fieldname + ', ' - packages_values_template += '?, ' - packages_fields += fieldname + ' ' + sqltype + ', ' - nef = 0 - for extra_field in sorted(packages_extra_fields.keys()): - packages_field_names += (', ' if nef > 0 else '') + extra_field - packages_values_template += (', ' if nef > 0 else '') + '?' 
- packages_fields += (', ' if nef > 0 else '') + extra_field + ' ' + \ - packages_extra_fields[extra_field] - nef += 1 - conn.execute(""" -CREATE TABLE IF NOT EXISTS packages(id INTEGER PRIMARY KEY NOT NULL, %s) -""" % (packages_fields)) - for table in db_struct['dep_tables']: - conn.execute(""" -CREATE TABLE IF NOT EXISTS %s (id INTEGER PRIMARY KEY NOT NULL, - name TEXT, flags INTEGER, version TEXT, build_arch TEXT, - package_id INTEGER NOT NULL)""" % (table)) - conn.commit() - db_struct['packages_tags'] = packages_tags - db_struct['packages_field_names'] = packages_field_names - db_struct['packages_values_template'] = packages_values_template - db_struct['defined'] = True - - - queue_in = mp.JoinableQueue() - for pkg in rpm_list: - queue_in.put((pkg, )) - - for i in xrange(NUM_PROCESSES): - queue_in.put(None) - - # Trying to prevent Exception AssertionError: AssertionError() in - # ignored - gc.collect() - time.sleep(1) - gc.disable() - global generator_value - id_generator = mp.Value('i', generator_value) - generator_lock = mp.Lock() - # run workers - workers = [] - for i in xrange(NUM_PROCESSES): - worker = mp.Process(target = process_package_worker, - args = (i, queue_in, id_generator, - generator_lock, db_struct, - repodir_id, build_archs, temp_dir, - no_shared_objects, no_so_symbols)) - workers.append(worker) - worker.start() - queue_in.join() - gc.enable() - generator_value = id_generator.value - - -def main(args): - global NUM_PROCESSES - - if os.path.exists(DB): - os.unlink(DB) - - if hasattr(os, "sysconf"): - if os.sysconf_names.has_key("SC_NPROCESSORS_ONLN"): - nproc = os.sysconf("SC_NPROCESSORS_ONLN") - if isinstance(nproc, int) and nproc > 0: - NUM_PROCESSES = nproc - - conn = sqlite3.connect(DB) - init_database(conn) - conn.commit() - - options = parseargs() - parser = ET.XMLParser() - tree = ET.parse(options.config, parser=parser) - config_root = tree.getroot() - temp_dir = '/dev/shm/rt-tmp/' - shutil.rmtree(temp_dir, True) - os.mkdir(temp_dir) - rpm_db_struct = {} - for xrepodir in config_root.find('repositories').findall('dir'): - repodir_id = add_repodir(xrepodir, conn) - build_archs = [None] if xrepodir.get('sources') != '.' else \ - get_build_archs(xrepodir, - config_root.find('repositories')) - process_repodir(xrepodir, repodir_id, options.cache_dir, - build_archs, conn, rpm_db_struct, temp_dir, - options.no_shared_objects, options.no_so_symbols) - shutil.rmtree(temp_dir, True) - if rpm_db_struct.get('defined'): - index_database(conn) - else: - print 'Database was not initialized ' \ - '(check whether repositories are empty).' 
-    os.unlink(DB)
-
-
-if __name__ == "__main__":
-    main(sys.argv)
diff --git a/prepare-repodb.py b/prepare-repodb.py
index 946dcbf..379675b 100755
--- a/prepare-repodb.py
+++ b/prepare-repodb.py
@@ -6,20 +6,792 @@ import sys
 import gettext
 import argparse
 import sqlite3
-import re
 import rpm
+import re
+import xml.etree.ElementTree as ET
+import subprocess
+import shutil
+import time
+import multiprocessing as mp
+import gc
+import urllib
+from urlparse import urlparse, urlunparse
+
+gettext.install('urpm-tools')
+
+DB = 'repo.db'
+
+NUM_PROCESSES = 1 # number of CPU's (evaluated automatically)
 
 RPMSENSE_LESS = 0x02
 RPMSENSE_GREATER = 0x04
 RPMSENSE_EQUAL = 0x08
 RPMSENSE_SENSEMASK = 0x0f
-RPMSENSE_FIND_PROVIDES = 0x8000
-RPMSENSE_MISSINGOK = 0x80000
 RPMSENSE_SCRIPT_POST = 0x400
 RPMSENSE_SCRIPT_PREUN = 0x800
 RPMSENSE_SCRIPT_POSTUN = 0x1000
+RPMSENSE_FIND_PROVIDES = 0x8000
+RPMSENSE_MISSINGOK = 0x80000
 
-DB = 'repo.db'
+RPMFILEMODE_DIRECTORY = 0x4000
+RPMFILEMODE_EXECUTE = 0111
+
+def parse_args():
+    parser = argparse.ArgumentParser(description=_('extract packages metadata'
+                                                   ' from RPM repositories'))
+    parser.add_argument('config', metavar='config',
+                        help=_('path to repo-analyze-config.xml'))
+    parser.add_argument('-c', '--cache-dir',
+                        help=_('path to cache directory'))
+    parser.add_argument('-O', '--no-shared-objects', action='store_true',
+                        help=_('don\'t process shared objects'))
+    parser.add_argument('-S', '--no-so-symbols', action='store_true',
+                        help=_('don\'t process shared object symbols'))
+    opts = parser.parse_args()
+    return opts
+
+#################################################
+# Fill database with the repositories data
+################################################
+
+def init_database(conn):
+    conn.executescript("""
+CREATE TABLE repodirs(id INTEGER PRIMARY KEY NOT NULL,
+    name TEXT UNIQUE, url TEXT, arch TEXT, sources TEXT);
+CREATE TABLE repodir_depends(id INTEGER PRIMARY KEY NOT NULL,
+    repodir_id INTEGER, depend_repodir_name TEXT);
+CREATE TABLE IF NOT EXISTS package_files(id INTEGER PRIMARY KEY NOT NULL,
+    package_id INTEGER NOT NULL, basename TEXT, path TEXT,
+    size INTEGER, mode INTEGER,
+    link_to_file_id INTEGER, link_to_path TEXT, mark TEXT);
+CREATE TABLE package_requires_res(id INTEGER PRIMARY KEY NOT NULL,
+    package_id INTEGER, requires_id INTEGER,
+    provides_id INTEGER, dep_package_id INTEGER);
+CREATE TABLE package_conflicts_res(id INTEGER PRIMARY KEY NOT NULL,
+    package_id INTEGER, conflicts_id INTEGER,
+    provides_id INTEGER, dep_package_id INTEGER);
+CREATE TABLE package_obsoletes_res(id INTEGER PRIMARY KEY NOT NULL,
+    package_id INTEGER, obsoletes_id INTEGER,
+    provides_id INTEGER, dep_package_id INTEGER);
+CREATE TABLE so_needed(id INTEGER PRIMARY KEY NOT NULL,
+    obj_file_id INTEGER, name TEXT);
+CREATE TABLE so_needed_res(id INTEGER PRIMARY KEY NOT NULL,
+    so_needed_id INTEGER, dep_obj_file_id INTEGER, res_type INTEGER);
+CREATE TABLE obj_symbols(id INTEGER PRIMARY KEY NOT NULL,
+    obj_file_id INTEGER, name TEXT, sym_type INTEGER);
+CREATE TABLE obj_symbols_res(id INTEGER PRIMARY KEY NOT NULL,
+    obj_sym_id INTEGER, dep_obj_sym_id INTEGER, res_type INTEGER);
+PRAGMA synchronous = OFF;
+PRAGMA journal_mode = OFF;
+PRAGMA cache_size = -1048576;
+""")
+    conn.commit()
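
The import phase trades durability for speed: `synchronous = OFF` skips fsyncs, `journal_mode = OFF` drops the rollback journal entirely, and a negative `cache_size` is a budget in KiB, so -1048576 is roughly 1 GiB of page cache. A crash mid-import can corrupt repo.db, which is acceptable here only because the database is rebuilt from scratch on every run. A minimal standalone sketch of the same setup:

    import sqlite3

    conn = sqlite3.connect('repo.db')
    conn.executescript("""
    PRAGMA synchronous = OFF;      -- no fsync per transaction
    PRAGMA journal_mode = OFF;     -- no rollback journal at all
    PRAGMA cache_size = -1048576;  -- negative means KiB: ~1 GiB of cache
    """)
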
+
+def index_database(conn):
+    print 'Indexing the database...'
+    conn.executescript("""
+CREATE INDEX rd_name ON repodirs(name);
+CREATE INDEX pkg_name ON packages(name);
+CREATE INDEX pkg_nvra ON packages(nvra);
+CREATE INDEX pkg_arch ON packages(arch);
+CREATE INDEX pkg_group ON packages(rpm_group);
+CREATE INDEX pkg_repodir ON packages(repodir_id);
+CREATE INDEX pkg_rq_pkg_req ON package_requires_res(package_id, requires_id);
+CREATE INDEX pkg_rq_pkg_prov ON package_requires_res(dep_package_id, provides_id);
+CREATE INDEX pkg_cf_pkg_conf ON package_conflicts_res(package_id, conflicts_id);
+CREATE INDEX pkg_cf_pkg_prov ON package_conflicts_res(dep_package_id, provides_id);
+CREATE INDEX pkg_ob_pkg_obs ON package_obsoletes_res(package_id, obsoletes_id);
+CREATE INDEX pkg_ob_pkg_prov ON package_obsoletes_res(dep_package_id, provides_id);
+CREATE INDEX pkg_file_pkg_id ON package_files(package_id);
+CREATE INDEX pkg_file_name ON package_files(basename);
+CREATE INDEX pkg_file_path ON package_files(path);
+CREATE INDEX pkg_file_mark ON package_files(mark);
+CREATE INDEX so_needed_obj_id ON so_needed(obj_file_id);
+CREATE INDEX so_needed_res_sn ON so_needed_res(so_needed_id);
+CREATE INDEX symbols_obj_name_type ON obj_symbols(obj_file_id, name, sym_type);
+CREATE INDEX symbols_name_type ON obj_symbols(name, sym_type);
+CREATE INDEX symbols_res_sym ON obj_symbols_res(obj_sym_id);
+""")
+    dep_tables = ['rpm_requires', 'rpm_provides',
+                  'rpm_conflicts', 'rpm_obsoletes']
+    for table in dep_tables:
+        conn.execute('CREATE INDEX %(tbl)s_pkg ON %(tbl)s(package_id)' %
+                     {'tbl': table})
+        conn.execute('CREATE INDEX %(tbl)s_name ON %(tbl)s(name)' %
+                     {'tbl': table})
+    conn.commit()
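
index_database() runs only after all packages have been loaded: bulk-inserting into unindexed tables and building the indexes in one pass afterwards is cheaper than maintaining them row by row. A toy illustration of the same pattern (table and rows fabricated):

    import sqlite3

    conn = sqlite3.connect(':memory:')
    conn.execute('CREATE TABLE packages(id INTEGER PRIMARY KEY, name TEXT)')
    rows = [(1, 'bash'), (2, 'coreutils')]
    # Load first...
    conn.executemany('INSERT INTO packages(id, name) VALUES (?, ?)', rows)
    # ...index later, as index_database() does for the real schema.
    conn.execute('CREATE INDEX pkg_name ON packages(name)')
    conn.commit()
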
+
+def get_rpm_header(rpm_ts, pkg):
+    hdr = None
+    try:
+        fdno = os.open(pkg, os.O_RDONLY)
+    except OSError as exc:
+        raise Exception('Unable to open file %s.\n%s' % (pkg, exc))
+    try:
+        hdr = rpm_ts.hdrFromFdno(fdno)
+    except rpm.error as exc:
+        raise Exception('Unable to read RPM header for %s\n%s.' % (pkg, exc))
+    finally:
+        os.close(fdno)
+    return hdr
+
+def generate_new_id(generator, gen_lock):
+    gen_lock.acquire()
+    last_id = generator.value
+    last_id += 1
+    generator.value = last_id
+    gen_lock.release()
+    return last_id
+
+FILE_REC_ID_IDX = 0
+FILE_REC_PATH_IDX = 3
+FILE_REC_LINK_IDX = 6
+FILE_REC_MARK_IDX = 7
+
+def register_object(data, object_file_record, temp_dir, no_so_symbols):
+    so_needed = data['so_needed']
+    obj_symbols = data['obj_symbols']
+    obj_id = object_file_record[0]
+    obj_file_path = object_file_record[3]
+    temp_obj_file = os.path.join(temp_dir, obj_file_path.lstrip('/'))
+
+    target_file = None
+    file_mark = None
+    od_out = ''
+    nmundef_out = ''
+    nmdef_out = ''
+    if os.path.islink(temp_obj_file):
+        target_file = os.path.join(os.path.dirname(obj_file_path),
+                                   os.readlink(temp_obj_file))
+        file_mark = 'link'
+    elif not os.path.exists(temp_obj_file):
+        file_mark = 'not-found'
+    else:
+        p = subprocess.Popen(['objdump', '-p', temp_obj_file],
+                             stdout=subprocess.PIPE,
+                             stderr=subprocess.PIPE)
+        od_out = p.communicate()[0]
+        if p.returncode != 0:
+            file_mark = 'invalid-format'
+        elif not(no_so_symbols):
+            p = subprocess.Popen(['nm', '-p', '-D', '--undefined-only',
+                                  temp_obj_file],
+                                 stdout=subprocess.PIPE,
+                                 stderr=subprocess.PIPE)
+            nmundef_out = p.communicate()[0]
+            if p.returncode != 0:
+                file_mark = 'no-symbols'
+            else:
+                p = subprocess.Popen(['nm', '-p', '-D', '--defined-only',
+                                      temp_obj_file],
+                                     stdout=subprocess.PIPE,
+                                     stderr=subprocess.PIPE)
+                nmdef_out = p.communicate()[0]
+                if p.returncode != 0:
+                    file_mark = 'no-symbols'
+                else:
+                    file_mark = 'so'
+
+    object_file_record[FILE_REC_LINK_IDX] = target_file
+    object_file_record[FILE_REC_MARK_IDX] = file_mark
+
+    dynsection = False
+    for odline in od_out.split('\n'):
+        odls = odline.strip()
+        if odls == '':
+            dynsection = False
+        elif odls == 'Динамический раздел:' or odls == 'Dynamic section:':
+            dynsection = True
+        elif dynsection:
+            needrem = re.match(r'\s+NEEDED\s+(.*)', odline)
+            if needrem:
+                so_needed.append([obj_id, needrem.group(1)])
+
+    for symline in nmundef_out.split('\n'):
+        smre = re.match(r'^.([\S]*)\s+(\w)\s(.*)$', symline)
+        if smre:
+            if smre.group(2) in ['v', 'w']:
+                continue
+            symname = smre.group(3)
+            obj_symbols.append([obj_id, symname, 0])
+
+    for symline in nmdef_out.split('\n'):
+        smre = re.match(r'^.([\S]*)\s+(\w)\s(.*)$', symline)
+        if smre:
+            symname = smre.group(3)
+            obj_symbols.append([obj_id, symname, 1])
+
+    return obj_id
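
register_object() classifies each extracted file and scrapes `objdump -p` for NEEDED entries and `nm -p -D` for symbols with a single regular expression. A sketch of that regex on a fabricated nm output line (the leading `.` consumes the first address character, which is why group(1) holds only the rest of the address):

    import re

    nm_line = '000000000001f0a0 T strlcpy'   # fabricated `nm -p -D` output
    smre = re.match(r'^.([\S]*)\s+(\w)\s(.*)$', nm_line)
    if smre:
        # 'T' = defined text symbol, 'strlcpy' = symbol name
        print smre.group(2), smre.group(3)
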
+
+def extract_files(local_pkg, files_list, obj_so_files_idx, temp_dir):
+    filelist = os.path.join(temp_dir, 'files.lst')
+    with open(filelist, 'w') as f:
+        for i in obj_so_files_idx:
+            f.write('.' + files_list[i][FILE_REC_PATH_IDX] + '\n')
+
+    rpm_cpio_cmd = 'rpm2cpio ' + local_pkg + ' | cpio -ivdu -E ' + filelist
+    p = subprocess.Popen(rpm_cpio_cmd,
+                         stdout=subprocess.PIPE,
+                         stderr=subprocess.STDOUT,
+                         cwd=temp_dir,
+                         shell=True)
+    output = p.communicate()[0]
+    if p.returncode != 0:
+        print >> sys.stderr, 'Couldn\'t extract files from package %s.' \
+            '\n\t%s' % (local_pkg, output.decode('utf-8'))
+        return False
+    return True
+
+def process_package_worker(num, queue_in, generator, gen_lock, db_struct,
+                           repodir_id, build_archs, temp_dir,
+                           no_shared_objects, no_so_symbols):
+
+    def to_string(pkg, tag, val):
+        if type(val) == type([]):
+            if not(val):
+                return None
+        try:
+            return str(val).decode('utf-8')
+        except:
+            print >> sys.stderr, ('Invalid UTF-8 string (%s: %s):\n' %
+                                  (pkg, tag))
+            print >> sys.stderr, val
+            return str(val).decode('utf-8', 'replace')
+
+
+    rpm_ts = rpm.TransactionSet()
+    rpm_ts.setVSFlags(~(rpm.RPMVSF_NEEDPAYLOAD))
+    data = {}
+    data['packages'] = []
+    for table in db_struct['dep_tables']:
+        data[table] = []
+    data['package_files'] = []
+    data['so_needed'] = []
+    data['obj_symbols'] = []
+
+    while True:
+        job = queue_in.get()
+        if job is None:
+            break
+        (pkg, ) = job
+
+        pkg_id = generate_new_id(generator, gen_lock)
+        local_pkg = get_local_file(pkg, temp_dir)
+
+        hdr = get_rpm_header(rpm_ts, local_pkg)
+        package_values = []
+        package_values.append(pkg_id)
+
+        for tag in db_struct['packages_tags']:
+            hval = hdr[tag]
+            package_values.append(
+                (sqlite3.Binary(hval) if len(hval)>0 else None)
+                    if tag in db_struct['blob_tags'] else \
+                to_string(pkg, tag, hval) if \
+                    type(hval) in [type([]), type('')] else hval
+                )
+        package_values.append(repodir_id)
+        package_values.append(pkg)
+        package_values.append(None)
+        data['packages'].append(package_values)
+        for table in db_struct['dep_tables']:
+            table_data = data[table]
+            rpref = 'RPMTAG_' + table[4 : -1].upper() # rpm_requires
+            (dep_name, dep_flags, dep_version) = \
+                (hdr[rpref + 'NAME'], hdr[rpref + 'FLAGS'], hdr[rpref + 'VERSION'])
+            for i in xrange(0, len(hdr[rpref + 'NAME'])):
+                for build_arch in build_archs:
+                    table_data.append([dep_name[i].decode('utf-8'),
+                                       dep_flags[i],
+                                       dep_version[i],
+                                       pkg_id, build_arch])
+        (pkg_file_paths, pkg_file_names, pkg_file_sizes, pkg_file_modes) = \
+            (hdr['RPMTAG_FILEPATHS'], hdr['RPMTAG_BASENAMES'],
+             hdr['RPMTAG_FILESIZES'], hdr['RPMTAG_FILEMODES'])
+        files_list = data['package_files']
+        files_dirs = {}
+        obj_so_files_idx = []
+        for i in xrange(0, len(pkg_file_paths)):
+            file_name = pkg_file_names[i]
+            file_path = pkg_file_paths[i]
+            pkg_file_id = generate_new_id(generator, gen_lock)
+            files_list.append([pkg_file_id, #FILE_REC_ID_IDX = 0
+                               pkg_id,
+                               file_name.decode('utf-8'),
+                               file_path.decode('utf-8'), #FILE_REC_PATH_IDX = 3
+                               pkg_file_sizes[i],
+                               pkg_file_modes[i],
+                               None, #link_to_path FILE_REC_LINK_IDX = 6
+                               None #mark FILE_REC_MARK_IDX = 7
+                               ])
+            if pkg_file_modes[i] & RPMFILEMODE_DIRECTORY != 0:
+                files_dirs[file_path] = False
+                continue
+            dir_name = os.path.dirname(file_path)
+            if dir_name != '' and dir_name not in files_dirs:
+                files_dirs[dir_name] = True
+            if no_shared_objects:
+                continue
+            if os.path.splitext(file_name)[1] in \
+                ['.debug', '.xz', '.conf', '.py', '.c', '.h', '.hpp', '.png',
+                 '.cc', '.cpp', '.sh', '.java', '.pl', '.patch', '.desktop']:
+                continue
+            if file_path.startswith('/usr/lib/debug/.build-id') or \
+                    file_path.endswith('/ld.so.cache'):
+                continue
+            if re.search(r'\.so($|\.)', file_name) or \
+                    (pkg_file_modes[i] & RPMFILEMODE_EXECUTE) != 0:
+                obj_so_files_idx.append(len(files_list) - 1)
+
+        for fdir in sorted(files_dirs.keys()):
+            if files_dirs[fdir]:
+                # Add parent directories as implicit files
+                # TODO: recursive processing?
+                pkg_file_id = generate_new_id(generator, gen_lock)
+                files_list.append([pkg_file_id, #FILE_REC_ID_IDX = 0
+                                   pkg_id,
+                                   os.path.basename(fdir),
+                                   fdir, #FILE_REC_PATH_IDX = 3
+                                   0,
+                                   -1, # special mode
+                                   None, #link_to_path FILE_REC_LINK_IDX = 6
+                                   None #mark FILE_REC_MARK_IDX = 7
+                                   ])
+
+        if obj_so_files_idx:
+            pkg_temp_dir = os.path.join(temp_dir, os.path.basename(local_pkg))
+            os.makedirs(pkg_temp_dir)
+            if extract_files(local_pkg, files_list,
+                             obj_so_files_idx, pkg_temp_dir):
+                for i in obj_so_files_idx:
+                    register_object(data, files_list[i], pkg_temp_dir,
+                                    no_so_symbols)
+
+            shutil.rmtree(pkg_temp_dir, True)
+
+        remove_cached_file(pkg)
+        queue_in.task_done()
+
+    conn = sqlite3.connect(DB, timeout=30)
+    conn.executemany("""
+INSERT INTO packages (%s) VALUES (%s)""" %
+                     (db_struct['packages_field_names'],
+                      db_struct['packages_values_template']),
+                     data['packages'])
+
+    for table in db_struct['dep_tables']:
+        conn.executemany("""
+INSERT INTO %s (name, flags, version, package_id, build_arch)
+VALUES (?, ?, ?, ?, ?)""" % table, data[table])
+
+    conn.executemany("""
+INSERT INTO package_files (id, package_id, basename, path,
+    size, mode, link_to_path, mark)
+VALUES (?, ?, ?, ?, ?, ?, ?, ?)""", data['package_files'])
+
+    conn.executemany("""
+INSERT INTO so_needed(obj_file_id, name) VALUES(?, ?)
+""", data['so_needed'])
+
+    conn.executemany("""
+INSERT INTO obj_symbols(obj_file_id, name, sym_type) VALUES(?, ?, ?)
+""", data['obj_symbols'])
+
+    conn.commit()
+    queue_in.task_done()
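
Because the workers are separate processes, ids for packages and files come from generate_new_id() (defined above) over a shared multiprocessing.Value guarded by a Lock; plain per-process counters would collide. A usage sketch (note that 'i' is a C int, so the counter is typically 32-bit):

    import multiprocessing as mp

    id_generator = mp.Value('i', 0)
    generator_lock = mp.Lock()
    # Each worker calls this to obtain a process-safe, unique id:
    print generate_new_id(id_generator, generator_lock)   # -> 1
    print generate_new_id(id_generator, generator_lock)   # -> 2
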
+
+def get_files(url, ext):
+    filelist = []
+    urlp = urlparse(url)
+    if urlp.scheme in ['ftp', 'http', 'https']:
+        return parse_index_html(wget_url(url, None), url, '.rpm')
+    dir_list = os.listdir(url)
+    for d in dir_list:
+        if d.endswith(ext):
+            filepath = os.path.normpath(os.path.join(url, d))
+            filelist.append(filepath)
+    return filelist
+
+local_cache = {}
+def get_local_file(url, temp_dir):
+    urlp = urlparse(url)
+    if urlp.scheme in ['ftp', 'http', 'https']:
+        cached_file_name = local_cache.get(url)
+        if cached_file_name and os.path.isfile(cached_file_name):
+            return cached_file_name
+        cache_dir = os.path.join(temp_dir, 'cache')
+        if not os.path.isdir(cache_dir):
+            os.makedirs(cache_dir)
+        temp_file = os.path.join(cache_dir, os.path.basename(url))
+        wget_url(url, temp_file)
+        local_cache[url] = temp_file
+        return temp_file
+    return url
+
+def remove_cached_file(url):
+    cached_file_name = local_cache.get(url)
+    if cached_file_name:
+        os.unlink(cached_file_name)
+        del local_cache[url]
+
+def wget_url(url, target_file):
+    urlp = urlparse(url)
+    wget_params = []
+    site = urlp.netloc
+    if urlp.username:
+        wget_params = wget_params + ['--auth-no-challenge',
+                                     '--http-user=%s' % urlp.username,
+                                     '--http-password=%s' %
+                                     ('""' if not urlp.password else urlp.password)]
+        site = site[site.find('@') + 1:]
+    url = urlunparse((urlp.scheme, site, urlp.path, urlp.params,
+                      urlp.query, urlp.fragment))
+    print 'Downloading %s...' % url
+    if target_file is None:
+        wget_params += ['-nv', '-O-', url]
+    else:
+        wget_params += ['-nv', '-O', target_file, url]
+    p = subprocess.Popen(['wget'] + wget_params,
+                         stdout=subprocess.PIPE,
+                         stderr=subprocess.PIPE)
+    wget_out, wget_err = p.communicate()
+    if p.returncode != 0:
+        print >> sys.stderr, ('Unable to get data from the url: %s '
+                              '(error: %d).\n%s\n%s') % \
+            (url, p.returncode, wget_out, wget_err)
+        raise Exception('Unable to download data (%d).' % p.returncode)
+    if target_file is None:
+        return wget_out
+
+def parse_index_html(index_html, base_url, filter_ext):
+    file_list = []
+    for match in re.finditer(r'href="([^"]+)"', index_html, re.M):
+        filename = match.group(1)
+        if filename.endswith(filter_ext):
+            if '://' in filename[:8]:
+                file_list.append(filename)
+                continue
+            filepath = os.path.join(base_url, filename)
+            if os.path.dirname(filepath) == base_url.rstrip('/') and \
+                    os.path.basename(filepath) == filename:
+                file_list.append(filepath)
+    return file_list
+
+def download_repodir(source_urlp, cache_dir):
+    site = source_urlp.netloc
+    site = site[site.find('@') + 1:]
+    target_dir = os.path.join(cache_dir,
+                              site,
+                              source_urlp.path.lstrip('/'))
+    if not os.path.isdir(target_dir):
+        os.makedirs(target_dir)
+    remote_files = {}
+    if source_urlp.scheme in ['ftp', 'http', 'https']:
+        source_url = source_urlp.geturl()
+        remote_dir_contents = parse_index_html(wget_url(source_url, None),
+                                               source_url, '.rpm')
+        for remote_file in remote_dir_contents:
+            remote_filename = urllib.unquote(os.path.basename(remote_file))
+            remote_files[remote_filename] = True
+            target_file = os.path.join(target_dir, remote_filename)
+            if os.path.isfile(target_file):
+                continue
+            wget_url(remote_file, target_file)
+
+    for local_filename in os.listdir(target_dir):
+        if local_filename not in remote_files and \
+                local_filename.endswith('.rpm'):
+            print 'Removing local file: %s.' % local_filename
+            os.unlink(os.path.join(target_dir, local_filename))
+
+    return target_dir
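
parse_index_html() keeps only hrefs that end with the wanted extension and resolve to direct children of the base URL. A toy call with a fabricated listing:

    html = ('<a href="bash-4.2-10.x86_64.rpm">pkg</a> '
            '<a href="../elsewhere/evil.rpm">up</a> '
            '<a href="README.txt">doc</a>')
    print parse_index_html(html, 'http://mirror.example.com/main/', '.rpm')
    # -> ['http://mirror.example.com/main/bash-4.2-10.x86_64.rpm']
    # the '../elsewhere' href fails the dirname check and is dropped
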
+
+def urpm_get_packages(media):
+    extra_params = []
+    if not media.endswith(' update'):
+        extra_params = ['--exclude-media', media + ' update']
+    p = subprocess.Popen(['urpmq', '-r', '--ignorearch',
+                          '--list', '--media', media] +
+                         extra_params,
+                         stdout=subprocess.PIPE,
+                         stderr=subprocess.PIPE)
+    urpmqr_out, urpmqr_err = p.communicate()
+    if p.returncode != 0 or len(urpmqr_err) > 0:
+        print >> sys.stderr, ('Unable to get a list of packages '
+                              'from the media: %s.\n'
+                              '%s\n%s') % (media, urpmqr_out, urpmqr_err)
+        raise Exception('Unable to get a list of packages (%d).' % p.returncode)
+# urpmi --no-install --allow-nodeps --force
+#   --download-all=/tmp/ xine-wavpack-1.2.4-1plf --media Desktop2012.1-8
+    p = subprocess.Popen(['urpmq', '-f', '--ignorearch',
+                          '--list', '--media', media] +
+                         extra_params,
+                         stdout=subprocess.PIPE,
+                         stderr=subprocess.PIPE)
+    urpmqf_out, urpmqf_err = p.communicate()
+    if p.returncode != 0 or len(urpmqf_err) > 0:
+        print >> sys.stderr, ('Unable to get a list of packages '
+                              'from the media: %s.\n'
+                              '%s\n%s') % (media, urpmqf_out, urpmqf_err)
+        raise Exception('Unable to get a list of packages (%d).' % p.returncode)
+
+    rpm_list = []
+    qr_lines = urpmqr_out.split('\n')
+    qf_lines = urpmqf_out.split('\n')
+    if len(qr_lines) != len(qf_lines):
+        raise Exception('Not consistent urpmq -r and urpmq -f outputs '
+                        '(%d and %d lines).' % (len(qr_lines), len(qf_lines)))
+    for i in xrange(0, len(qf_lines)):
+        qf_line = qf_lines[i]
+        if qf_line.strip() == '':
+            continue
+        if not qf_line.startswith(qr_lines[i]):
+            raise Exception('Not consistent urpmq -r and urpmq -f outputs: '
+                            '%s and %s' % (qr_lines[i], qf_line))
+        rpm_list.append('urpm://%s/%s.rpm#%s' % (urllib.quote(media),
+                                                 urllib.quote(qf_line),
+                                                 urllib.quote(qr_lines[i])))
+    return rpm_list
+
+
+def get_urpmi(urpm_package, target_dir):
+    urlp = urlparse(urpm_package)
+    package_name = urllib.unquote(urlp.fragment)
+    print package_name
+    p = subprocess.Popen(['urpmi', '--no-install',
+                          '--force', '--no-suggests',
+                          '--allow-nodeps',
+                          '--no-download-all',
+                          '--media', urlp.netloc,
+                          package_name])
+                         #stdout=subprocess.PIPE,
+                         #stderr=subprocess.PIPE)
+    urpmi_out, urpmi_err = p.communicate()
+    if p.returncode != 0:
+        print >> sys.stderr, ('Unable to get the package %s '
+                              'from the media %s.\n'
+                              '%s\n%s') % (
+            package_name, urlp.netloc,
+            urpmi_out, urpmi_err)
+        raise Exception('Unable to get the package %s (%d).' %
+                        (package_name, p.returncode))
+
+
+def urpm_get_repodir(repodir_name, cache_dir):
+    target_dir = os.path.join(cache_dir,
+                              repodir_name,
+                              'rpms')
+    if not os.path.isdir(target_dir):
+        os.makedirs(target_dir)
+    urpm_files = {}
+    urpm_media_contents = urpm_get_packages(repodir_name)
+    for urpm_package in urpm_media_contents:
+        remote_filename = urllib.unquote(os.path.basename(urpm_package))
+        target_file = os.path.join(target_dir, remote_filename)
+        get_urpmi(urpm_package, os.path.join(cache_dir,
+                                             repodir_name))
+        print target_file
+    raise Exception('Not implemented.')
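
urpm media are addressed with a synthetic urpm:// URL that packs the quoted media name into the netloc and the `urpmq -r` name into the fragment, so the standard urlparse machinery can split it back apart. A sketch with fabricated values (fragment handling for non-standard schemes varies slightly across Python 2.7 point releases):

    import urllib
    from urlparse import urlparse

    u = 'urpm://%s/%s.rpm#%s' % (urllib.quote('Main Media'),
                                 urllib.quote('bash-4.2-10.x86_64'),
                                 urllib.quote('bash-4.2-10'))
    urlp = urlparse(u)
    print urllib.unquote(urlp.netloc)     # media to pass to urpmi --media
    print urllib.unquote(urlp.fragment)   # package name to install
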
+
+generator_value = 0
+
+def import_repodir(repodir_id, cache_dir, build_archs, conn,
+                   db_struct, temp_dir, no_shared_objects, no_so_symbols):
+
+    rd_rec = conn.execute("""SELECT name, url FROM repodirs WHERE id = ?""",
+                          [repodir_id]).fetchone()
+    (repodir_name, repodir_url) = (rd_rec[0], rd_rec[1])
+    urlp = urlparse(repodir_url)
+    working_url = repodir_url
+    if cache_dir is not None:
+        if urlp.scheme in ['ftp', 'http', 'https']:
+            working_url = download_repodir(urlp, cache_dir)
+        elif urlp.scheme == 'urpm':
+            working_url = urpm_get_repodir(repodir_name, cache_dir)
+        elif urlp.scheme not in ['', 'file']:
+            raise Exception('Invalid scheme in the repository url: %s' %
+                            repodir_url)
+    rpm_list = []
+    rpm_list = get_files(working_url, '.rpm')
+    if not rpm_list:
+        return
+    print urlp.netloc[urlp.netloc.find('@') + 1:] + urlp.path, ': ', \
+        len(rpm_list)
+    if not db_struct.get('defined'):
+        rpm_ts = rpm.TransactionSet()
+        rpm_ts.setVSFlags(~(rpm.RPMVSF_NEEDPAYLOAD))
+        # ts.setVSFlags(~(rpm.RPMVSF_NOMD5|rpm.RPMVSF_NEEDPAYLOAD))
+        hdr = get_rpm_header(rpm_ts, get_local_file(rpm_list[0], temp_dir))
+
+        # Retain sort order!
+        packages_extra_fields = {'repodir_id': 'INTEGER',
+                                 'rpm_url': 'TEXT',
+                                 'sourcerpm_package': 'TEXT'}
+
+        file_tags_re = r'^RPMTAG_(BASENAMES|FILE[\w\d]+)'
+        dir_tags_re = r'^RPMTAG_DIR(INDEXES|NAMES)'
+        changelog_tags_re = r'^RPMTAG_CHANGELOG\w+'
+        trigger_tags_re = r'^RPMTAG_TRIGGER\w+'
+
+        datetime_tags = ['RPMTAG_PACKAGETIME', 'RPMTAG_RPMLIBTIMESTAMP', ]
+        db_struct['blob_tags'] = ['RPMTAG_RSAHEADER', 'RPMTAG_DSAHEADER',
+                                  'RPMTAG_HEADERIMMUTABLE', 'RPMTAG_SIGMD5',
+                                  'RPMTAG_PKGID', 'RPMTAG_SOURCEPKGID']
+
+        reserved_field_names = ['id', 'group']
+        skip_tags_re = '^RPMTAG_(C|D|E|N|P|R|V|HEADERIMMUTABLE)$'
+        #C - CONFLICTNAME, D - DISTEPOCH, E - EPOCH, N - NAME, O - OBSOLETENAME
+        #P - PROVIDENAME, R - RELEASE, V - VERSION
+
+        types = {"<type 'str'>": "TEXT", "<type 'int'>": "INTEGER",
+                 "<type 'list'>": "TEXT", "<type 'NoneType'>": "TEXT"}
+
+        dep_tags_re = r'^RPMTAG_(CONFLICT|OBSOLETE|PROVIDE|REQUIRE)\w+'
+
+        db_struct['dep_tables'] = ['rpm_requires', 'rpm_provides',
+                                   'rpm_conflicts', 'rpm_obsoletes']
+
+        packages_field_names = 'id, '
+        packages_values_template = '?,'
+        packages_tags = []
+        packages_fields = ''
+
+        rpmtags = [str(t) for t in dir(rpm) if t.startswith('RPMTAG_') ]
+        for tag in rpmtags:
+            if (re.match(file_tags_re, tag) or re.match(dir_tags_re, tag) or
+                    re.match(changelog_tags_re, tag) or
+                    re.match(skip_tags_re, tag) or
+                    re.match(trigger_tags_re, tag) or
+                    re.match(dep_tags_re, tag)):
+                continue
+            sqltype = "TIMESTAMP" if tag in datetime_tags else \
+                "BLOB" if tag in db_struct['blob_tags'] else \
+                types[str(type(hdr[tag]))]
+            fieldname = tag.replace('RPMTAG_', '').lower()
+            if fieldname in reserved_field_names:
+                fieldname = 'rpm_' + fieldname
+            packages_tags.append(tag)
+            packages_field_names += fieldname + ', '
+            packages_values_template += '?, '
+            packages_fields += fieldname + ' ' + sqltype + ', '
+        nef = 0
+        for extra_field in sorted(packages_extra_fields.keys()):
+            packages_field_names += (', ' if nef > 0 else '') + extra_field
+            packages_values_template += (', ' if nef > 0 else '') + '?'
+            packages_fields += (', ' if nef > 0 else '') + extra_field + ' ' + \
+                packages_extra_fields[extra_field]
+            nef += 1
+        conn.execute("""
+CREATE TABLE IF NOT EXISTS packages(id INTEGER PRIMARY KEY NOT NULL, %s)
+""" % (packages_fields))
+        for table in db_struct['dep_tables']:
+            conn.execute("""
+CREATE TABLE IF NOT EXISTS %s (id INTEGER PRIMARY KEY NOT NULL,
+    name TEXT, flags INTEGER, version TEXT, build_arch TEXT,
+    package_id INTEGER NOT NULL)""" % (table))
+        conn.commit()
+        db_struct['packages_tags'] = packages_tags
+        db_struct['packages_field_names'] = packages_field_names
+        db_struct['packages_values_template'] = packages_values_template
+        db_struct['defined'] = True
+
+
+    queue_in = mp.JoinableQueue()
+    for pkg in rpm_list:
+        queue_in.put((pkg, ))
+
+    for i in xrange(NUM_PROCESSES):
+        queue_in.put(None)
+
+    # Trying to prevent Exception AssertionError: AssertionError() in
+    # ignored
+    gc.collect()
+    time.sleep(1)
+    gc.disable()
+    global generator_value
+    id_generator = mp.Value('i', generator_value)
+    generator_lock = mp.Lock()
+    # run workers
+    workers = []
+    for i in xrange(NUM_PROCESSES):
+        worker = mp.Process(target = process_package_worker,
+                            args = (i, queue_in, id_generator,
+                                    generator_lock, db_struct,
+                                    repodir_id, build_archs, temp_dir,
+                                    no_shared_objects, no_so_symbols))
+        workers.append(worker)
+        worker.start()
+    queue_in.join()
+    gc.enable()
+    generator_value = id_generator.value
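
The packages table schema is derived from whatever RPMTAG_* constants the rpm module exposes, with the SQL type picked by looking up `str(type(hdr[tag]))` in the `types` dict, whose keys are the printable forms of Python 2 type objects:

    # How the `types` lookup above resolves, per Python 2 repr of each type:
    types = {"<type 'str'>": "TEXT", "<type 'int'>": "INTEGER",
             "<type 'list'>": "TEXT", "<type 'NoneType'>": "TEXT"}
    for value in ['name', 42, ['a', 'b'], None]:
        print str(type(value)), '->', types[str(type(value))]
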
+
+def add_repodir(xrepodir, conn):
+    dbc = conn.cursor()
+    dbc.execute("""
+INSERT INTO repodirs (name, url, sources) VALUES (?, ?, ?)
+""", [xrepodir.get('name'), xrepodir.get('url'), xrepodir.get('sources')])
+    repodir_id = dbc.lastrowid
+    for depend in xrepodir.findall('dependency'):
+        dbc.execute("""
+INSERT INTO repodir_depends(repodir_id, depend_repodir_name) VALUES (?, ?)
+""", [repodir_id, depend.text.strip()])
+    conn.commit()
+    return repodir_id
+
+def get_build_archs(xrepodir, xrepodirs):
+    build_archs = []
+    for depend in xrepodir.findall('dependency'):
+        arch_sign = '$arch'
+        depend_repo = depend.text.strip()
+        spos = depend_repo.find(arch_sign)
+        if spos >= 0:
+            drepo_prefix = depend_repo[:spos]
+            drepo_postfix = depend_repo[spos + len(arch_sign):]
+            for xrepodir in xrepodirs.findall('dir'):
+                repodir_name = xrepodir.get('name')
+                if repodir_name.startswith(drepo_prefix) and \
+                        repodir_name.endswith(drepo_postfix):
+                    repo_arch = repodir_name[len(drepo_prefix) :
+                                             len(repodir_name) - len(drepo_postfix)]
+                    if repo_arch == 'SRPMS':
+                        continue
+                    if repo_arch not in build_archs:
+                        build_archs.append(repo_arch)
+    if build_archs:
+        return build_archs
+    return [None]
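
get_build_archs() expands the `$arch` placeholder in a dependency template by matching it against the configured repodir names and collecting the architectures that fill the gap (SRPMS excluded). A standalone sketch with fabricated names:

    template = 'rosa-dx-chrome-1.0/$arch/main/release'
    names = ['rosa-dx-chrome-1.0/i586/main/release',
             'rosa-dx-chrome-1.0/x86_64/main/release',
             'rosa-dx-chrome-1.0/SRPMS/main/release']
    spos = template.find('$arch')
    prefix, postfix = template[:spos], template[spos + len('$arch'):]
    archs = [n[len(prefix):len(n) - len(postfix)] for n in names
             if n.startswith(prefix) and n.endswith(postfix)]
    print [a for a in archs if a != 'SRPMS']   # -> ['i586', 'x86_64']
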
+
+def import_repositories(options, conn):
+    init_database(conn)
+
+    rpm_db_struct = {}
+    tree = ET.parse(options.config, parser=ET.XMLParser())
+    config_root = tree.getroot()
+
+    temp_dir = '/dev/shm/rt-tmp/'
+    shutil.rmtree(temp_dir, True)
+    os.mkdir(temp_dir)
+
+    for xrepodir in config_root.find('repositories').findall('dir'):
+        repodir_id = add_repodir(xrepodir, conn)
+        build_archs = [None] if xrepodir.get('sources') != '.' else \
+            get_build_archs(xrepodir,
+                            config_root.find('repositories'))
+        import_repodir(repodir_id, options.cache_dir,
+                       build_archs, conn, rpm_db_struct, temp_dir,
+                       options.no_shared_objects, options.no_so_symbols)
+
+    shutil.rmtree(temp_dir, True)
+    if not rpm_db_struct.get('defined'):
+        print 'Database was not initialized ' \
+            '(check whether repositories are empty).'
+        return False
+    index_database(conn)
+    return True
+
+################################################
+### Post-process repo.db after data import
+################################################
 
 def version_ok(required_version, compare_flag, candidate_version):
     def sep_version(version):
@@ -148,8 +920,10 @@ def version_ok(required_version, compare_flag, candidate_version):
         return False
     rpm_cmp_res = rpm_cmp_versions(candidate_version, required_version)
     #if (cmp_res != rpm_cmp_res):
-        #print >> sys.stderr, ('Invalid compare: "%s" vs "%s"! Results: rc: %d, rpm: %d.' %
-                              #(candidate_version, required_version, cmp_res, rpm_cmp_res))
+    #    print >> sys.stderr, (('Invalid compare: "%s" vs "%s"! '
+    #                           'Results: rc: %d, rpm: %d.') %
+    #                          (candidate_version, required_version,
+    #                           cmp_res, rpm_cmp_res))
     if compare_flag == RPMSENSE_EQUAL:
         return cmp_res == 0
     elif compare_flag == RPMSENSE_LESS | RPMSENSE_EQUAL:
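
version_ok() receives the requirement's version string, its RPMSENSE comparison bits masked with RPMSENSE_SENSEMASK, and the candidate provides version. A sketch of a call for a "foo >= 2.0"-style requirement (version_ok as defined above):

    flags = RPMSENSE_GREATER | RPMSENSE_EQUAL     # encodes '>='
    # candidate 2.1-3 satisfies '>= 2.0', so this should print True
    print version_ok('2.0', flags & RPMSENSE_SENSEMASK, '2.1-3')
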
-def process_repodir_requires(dbc, repodir_id, repodir_name, repodir_depends, requires_build_arch):
-    global n
+def process_repodir_requires(conn, repodir_id, repodir_name,
+                             repodir_depends, requires_build_arch):
     print 'Processing repo %d: %s (with depends: %s)' % (repodir_id, repodir_name, str(repodir_depends))
-    package_requires = dbc.execute("""
+    package_requires = conn.execute("""
 SELECT packages.id AS package_id, packages.name AS package_name, packages.nvra,
     req.id, req.name, flags, req.version
     FROM packages, rpm_requires req
@@ -240,7 +1011,7 @@ SELECT packages.id AS package_id, packages.name AS package_name, packages.nvra,
             # see if($N=~/\A(rpmlib|executable)\(.+\)\Z/) in urpm_repoclosure.pl
             req_res.append({})
         else:
-            depend_candidates = dbc.execute("""
+            depend_candidates = conn.execute("""
 SELECT packages.id AS package_id, packages.name AS package_name, packages.nvra,
     prov.id, prov.name, flags, prov.version
     FROM packages, rpm_provides AS prov
@@ -293,14 +1064,13 @@ SELECT packages.id AS package_id, packages.name AS package_name, packages.nvra,
             if (requires_flags & (RPMSENSE_SCRIPT_POST |
                                   RPMSENSE_SCRIPT_PREUN |
                                   RPMSENSE_SCRIPT_POSTUN)) != 0:
-                int_files_cnt = dbc.execute("""
+                int_files_cnt = conn.execute("""
 SELECT COUNT(1) FROM package_files WHERE package_id = ? AND path = ?
 """, [cpackage_id, requires_name]).fetchone()
                 if int_files_cnt[0] > 0:
                     req_res.append({})
             else:
-                #TODO: Check file dependencies (/usr/bin/python (required by ant-scripts-1.7.1-7.0.6.noarch), /usr/sbin/useradd (required by tomcat5-5.5.28-0.5.2.noarch))?
-                files_deps = dbc.execute("""
+                files_deps = conn.execute("""
 SELECT package_id FROM package_files
     WHERE path = ? AND package_id in
         (SELECT id FROM packages WHERE repodir_id IN (%s))
@@ -313,7 +1083,7 @@ SELECT package_id FROM package_files
 
         if len(req_res) > 0:
             for res_rec in req_res:
-                dbc.execute("""
+                conn.execute("""
 INSERT INTO package_requires_res(package_id, requires_id,
     provides_id, dep_package_id)
 VALUES (?, ?, ?, ?)
@@ -321,27 +1091,11 @@ VALUES (?, ?, ?, ?)
         else:
             print requires_name, ' ', requires_version, ' (required by %s)' % package_nvra, ' not found!!!'
             broken_dep += 1
-            n = n + 1
-            #print "n = ", n
-#            if n == 60000:
-#                break
     print 'broken_deps: ', broken_dep
     print ''
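The rpmlib()/executable() short-circuit above mirrors the test from urpm_repoclosure.pl referenced in the comment: such capabilities are satisfied by the package tooling itself and never resolve to a package. A standalone sketch of that filter (is_internal_capability is an illustrative name):

    import re

    def is_internal_capability(name):
        # rpmlib(...) and executable(...) are provided by the tooling itself
        return re.match(r'\A(rpmlib|executable)\(.+\)\Z', name) is not None

    print is_internal_capability('rpmlib(PayloadIsLzma)')   # True
    print is_internal_capability('libc.so.6(GLIBC_2.4)')    # False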
-def extract_arch(arch_template, repo_name):
-    arch_sign = '$arch'
-    spos = arch_template.find(arch_sign)
-    if spos >= 0:
-        repo_prefix = arch_template[:spos]
-        repo_postfix = arch_template[spos + len(arch_sign):]
-        if repo_name.startswith(repo_prefix) and \
-            repo_name.endswith(repo_postfix):
-            return repo_name[len(repo_prefix) :
-                len(repo_name) - len(repo_postfix)]
-    return None
-
-def process_repodir_file_links(dbc, repodir_id, repodir_name, repodir_depends):
-    package_files_links = dbc.execute("""
+def process_repodir_file_links(conn, repodir_id, repodir_name, repodir_depends):
+    package_files_links = conn.execute("""
 SELECT packages.id AS package_id, packages.name AS package_name, packages.nvra,
     package_files.id AS object_id, package_files.path, package_files.link_to_path
     FROM packages, package_files
@@ -359,7 +1113,7 @@ SELECT packages.id AS package_id, packages.name AS package_name, packages.nvra,
             target_paths[target_path] = True
         while target_path != '':
             new_target_path = None
-            tofile = dbc.execute("""
+            tofile = conn.execute("""
 SELECT id, link_to_path FROM package_files WHERE path = ? AND package_id = ?
 """, [target_path, pkg_id]).fetchone()
             if tofile:
@@ -367,7 +1121,7 @@ SELECT id, link_to_path FROM package_files WHERE path = ? AND package_id = ?
                 new_target_path = tofile[1]
             if not target_obj_id:
                 # Just two levels of dependency recursion - TODO: Full depth recursion?
-                tofile = dbc.execute("""
+                tofile = conn.execute("""
 SELECT id, link_to_path FROM package_files WHERE path = ? AND package_id IN (
     SELECT dep_package_id FROM package_requires_res WHERE package_id = ? UNION
@@ -388,16 +1142,17 @@ SELECT id, link_to_path FROM package_files WHERE path = ? AND package_id IN (
                 target_paths[target_path] = True
 
         if target_obj_id:
-            dbc.execute("""
+            conn.execute("""
 UPDATE package_files SET link_to_file_id = ? WHERE id = ?
 """, [target_obj_id, object_id])
         else:
             # print 'target %s not found (%d: %s)' % (target_path, pkg_id, pkg_name)
             pass
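process_repodir_file_links above chases link_to_path chains through package_files, remembering visited paths so a symlink loop cannot hang the scan. The same idea reduced to plain dictionaries (the files mapping of path -> (file_id, link_to_path) is made up for illustration):

    def resolve_link(files, start_path):
        seen = {start_path: True}
        path = start_path
        while path:
            rec = files.get(path)       # (file_id, link_to_path) or None
            if rec is None:
                return None             # target not packaged at all
            if rec[1] is None:
                return rec[0]           # a real file: chain resolved
            path = rec[1]
            if path in seen:
                return None             # symlink loop: give up
            seen[path] = True

    files = {'/usr/bin/vi': (1, '/etc/alternatives/vi'),
             '/etc/alternatives/vi': (2, '/usr/bin/vim'),
             '/usr/bin/vim': (3, None)}
    print resolve_link(files, '/usr/bin/vi')   # prints 3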
-def process_repodir_so_needed(dbc, repodir_id, repodir_name, repodir_depends):
+
+def process_repodir_so_needed(conn, repodir_id, repodir_name, repodir_depends):
     print 'Searching object files resolutions (1)...'
-    dbc.execute("""
+    conn.execute("""
 INSERT INTO so_needed_res(so_needed_id, dep_obj_file_id, res_type)
 SELECT so_needed.id, tpf.id, 1 FROM packages CROSS JOIN package_files spf
     CROSS JOIN so_needed CROSS JOIN rpm_requires
@@ -416,7 +1171,7 @@ INSERT INTO so_needed_res(so_needed_id, dep_obj_file_id, res_type)
     search_repodirs.extend(repodir_depends)
     in_repodirs = ','.join(str(id) for id in search_repodirs)
-    objects_not_resolved1 = dbc.execute("""
+    objects_not_resolved1 = conn.execute("""
 SELECT packages.id AS package_id, packages.nvra,
     package_files.id AS object_id, package_files.basename AS object_name,
     so_needed.id AS so_needed_id, so_needed.name AS so_needed_name
@@ -431,14 +1186,14 @@ SELECT packages.id AS package_id, packages.nvra,
     if objects_not_resolved1:
         print 'Searching object files resolutions (2)...'
         in_so_needed = ','.join(str(obj_rec[4]) for obj_rec in objects_not_resolved1)
-        dbc.execute("""
+        conn.execute("""
 INSERT INTO so_needed_res(so_needed_id, dep_obj_file_id, res_type)
 SELECT so_needed.id, tpf.id, 2 FROM packages, package_files tpf, so_needed
     WHERE packages.repodir_id IN (%s) AND packages.id = tpf.package_id AND
         so_needed.id IN (%s) AND tpf.basename = so_needed.name
 """ % (in_repodirs, in_so_needed))
-        objects_not_resolved2 = dbc.execute("""
+        objects_not_resolved2 = conn.execute("""
 SELECT packages.id AS package_id, packages.nvra,
     package_files.id AS object_id, package_files.basename AS object_name,
     so_needed.id AS so_needed_id, so_needed.name AS so_needed_name
@@ -451,10 +1206,10 @@ SELECT packages.id AS package_id, packages.nvra,
         print 'Object files not resolved: ', len(objects_not_resolved2)
 
-def process_repodir_obj_symbols(dbc, repodir_id, repodir_name, repodir_depends):
+def process_repodir_obj_symbols(conn, repodir_id, repodir_name, repodir_depends):
     print 'Searching symbols resolutions (1)...'
     # EXPLAIN QUERY PLAN
-    dbc.execute("""
+    conn.execute("""
 INSERT INTO obj_symbols_res(obj_sym_id, dep_obj_sym_id, res_type)
 SELECT sos.id, tos.id, 1 FROM packages CROSS JOIN package_files spf CROSS JOIN obj_symbols sos
     CROSS JOIN so_needed CROSS JOIN so_needed_res CROSS JOIN package_files tpf CROSS JOIN obj_symbols tos
@@ -465,7 +1220,7 @@ SELECT sos.id, tos.id, 1 FROM packages CROSS JOIN package_files spf CROSS JOIN o
         tos.sym_type = 1 AND tos.name = sos.name
 """, [repodir_id])
     print 'Searching symbols resolutions (2)...'
-    dbc.execute("""
+    conn.execute("""
 INSERT INTO obj_symbols_res(obj_sym_id, dep_obj_sym_id, res_type)
 SELECT sos.id, tos.id, 2 FROM packages CROSS JOIN package_files CROSS JOIN obj_symbols sos
     CROSS JOIN so_needed CROSS JOIN so_needed_res CROSS JOIN obj_symbols tos
@@ -478,7 +1233,7 @@ SELECT sos.id, tos.id, 2 FROM packages CROSS JOIN package_files CROSS JOIN obj_s
     search_repodirs = [repodir_id]
     search_repodirs.extend(repodir_depends)
     in_repodirs = ','.join(str(id) for id in search_repodirs)
-    dbc.execute("""
+    conn.execute("""
 INSERT INTO obj_symbols_res(obj_sym_id, dep_obj_sym_id, res_type)
 SELECT sos.id, tos.id, 3 FROM packages CROSS JOIN package_files CROSS JOIN obj_symbols sos
     CROSS JOIN obj_symbols tos CROSS JOIN package_files tpf
@@ -489,13 +1244,27 @@ SELECT sos.id, tos.id, 3 FROM packages CROSS JOIN package_files CROSS JOIN obj_s
 """ % in_repodirs, [repodir_id])
 
 
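The process_repodir_sources hunk below ties each binary package to its source package by stripping the trailing '.rpm' from the sourcerpm tag (the substr(..., length - 4) in the SQL). The same transformation in plain Python, with a made-up tag for illustration:

    def sourcerpm_nvra(sourcerpm):
        # 'foo-1.0-1.src.rpm' -> 'foo-1.0-1.src', matching the SQL substr()
        if sourcerpm and sourcerpm.endswith('.rpm'):
            return sourcerpm[:-len('.rpm')]
        return None

    print sourcerpm_nvra('foo-1.0-1.src.rpm')   # prints 'foo-1.0-1.src'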
""" % in_repodirs, [dr_name]).fetchall() if len(repodir_depend_found) == 0: @@ -507,34 +1276,30 @@ SELECT id, name FROM repodirs WHERE id IN (%s) AND name = ? if not all_depends_ready: return False print repo_name, ' ', depend_repodir_list, ' ', dep_arch - process_repodir_dependencies(dbc, repo_id, repo_name, repodir_depends, 'conflicts') - process_repodir_dependencies(dbc, repo_id, repo_name, repodir_depends, 'obsoletes') - process_repodir_requires(dbc, repo_id, repo_name, repodir_depends, dep_arch) - process_repodir_file_links(dbc, repo_id, repo_name, repodir_depends) - process_repodir_so_needed(dbc, repo_id, repo_name, repodir_depends) - process_repodir_obj_symbols(dbc, repo_id, repo_name, repodir_depends) - - if repo_sources: - print 'Searching source rpms...' - dbc.execute(""" -UPDATE packages SET sourcerpm_package = - (SELECT id FROM packages ps - WHERE repodir_id IN (SELECT id FROM repodirs WHERE name = ?) AND - ps.nvra = substr(packages.sourcerpm, 1, length(packages.sourcerpm)-4) - ) - WHERE repodir_id = ? AND sourcerpm LIKE '%.rpm' -""", [repo_sources, repo_id]) + process_repodir_dependencies(conn, repo_id, repo_name, repodir_depends, 'conflicts') + process_repodir_dependencies(conn, repo_id, repo_name, repodir_depends, 'obsoletes') + process_repodir_requires(conn, repo_id, repo_name, repodir_depends, dep_arch) + process_repodir_file_links(conn, repo_id, repo_name, repodir_depends) + process_repodir_so_needed(conn, repo_id, repo_name, repodir_depends) + process_repodir_obj_symbols(conn, repo_id, repo_name, repodir_depends) + process_repodir_sources(conn, repo_id, repo_sources) return True -def main(args): +def process_repodb(conn): - conn = sqlite3.connect(DB) - dbc = conn.cursor() + def extract_arch(arch_template, repo_name): + arch_sign = '$arch' + spos = arch_template.find(arch_sign) + if spos >= 0: + repo_prefix = arch_template[:spos] + repo_postfix = arch_template[spos + len(arch_sign):] + if repo_name.startswith(repo_prefix) and \ + repo_name.endswith(repo_postfix): + return repo_name[len(repo_prefix) : + len(repo_name) - len(repo_postfix)] + return None - global n - n = 0 - dbc.executescript(""" -PRAGMA cache_size = -1048576; + conn.executescript(""" DELETE FROM package_requires_res; DELETE FROM package_conflicts_res; DELETE FROM package_obsoletes_res; @@ -549,37 +1314,40 @@ ANALYZE; repodirs_processed_cnt = -1 while repodirs_processed_cnt < len(repodirs_processed): in_repodirs = ','.join(str(id) for id in repodirs_processed) - repodirs = dbc.execute(""" -SELECT id, name, sources, path FROM repodirs WHERE sources <> '.' AND id NOT IN (%s) + repodirs = conn.execute(""" +SELECT id, name, sources FROM repodirs WHERE sources <> '.' AND id NOT IN (%s) """ % in_repodirs).fetchall() for repodir in repodirs: - (repo_id, repo_name, repo_sources) = (repodir[0], repodir[1], repodir[2]) - depend_repodir_names = dbc.execute( + (repodir_id, repodir_name, repodir_sources) = \ + (repodir[0], repodir[1], repodir[2]) + depend_repodir_names = conn.execute( """ SELECT depend_repodir_name FROM repodir_depends WHERE repodir_id = ? 
-""", [repo_id]).fetchall() +""", [repodir_id]).fetchall() depend_repodir_list = [drn[0] for drn in depend_repodir_names] - if process_repodir(dbc, repo_id, repo_name, repo_sources, depend_repodir_list, repodirs_processed, None): - repodirs_processed.append(repo_id) + if process_repodir(conn, repodir_id, repodir_name, repodir_sources, + depend_repodir_list, repodirs_processed, None): + repodirs_processed.append(repodir_id) repodirs_processed_cnt = len(repodirs_processed) #Process SRPMS repodirs_processed_cnt = -1 while repodirs_processed_cnt < len(repodirs_processed): - repodirs = dbc.execute(""" -SELECT id, name, sources, path FROM repodirs WHERE sources = '.' + repodirs = conn.execute(""" +SELECT id, name, sources FROM repodirs WHERE sources = '.' """).fetchall() for repodir in repodirs: - (repo_id, repo_name, repo_sources) = (repodir[0], repodir[1], repodir[2]) + (repodir_id, repodir_name, repodir_sources) = \ + (repodir[0], repodir[1], repodir[2]) src_build_archs = [] - depend_repodir_names = dbc.execute( + depend_repodir_names = conn.execute( """ SELECT depend_repodir_name FROM repodir_depends WHERE repodir_id = ? -""", [repo_id]).fetchall() +""", [repodir_id]).fetchall() for drn in depend_repodir_names: dr_name = drn[0] if '$arch' in dr_name: - depend_repodir_found = dbc.execute( + depend_repodir_found = conn.execute( """ SELECT id, name FROM repodirs WHERE name LIKE ? """, [dr_name.replace('$arch', '%')]).fetchall() @@ -592,27 +1360,50 @@ SELECT id, name FROM repodirs WHERE name LIKE ? continue src_build_archs.append(arch) else: - raise Exception('Source repository should depend on */$arch/* repo.') + raise Exception('Source repository should depend ' + 'on */$arch/* repo.') for arch in src_build_archs: depend_repodir_list = [drn[0].replace('$arch', arch) for drn in depend_repodir_names] - if not process_repodir(dbc, repo_id, repo_name, None, depend_repodir_list, repodirs_processed, arch): + if not process_repodir(conn, repodir_id, repodir_name, None, + depend_repodir_list, repodirs_processed, + arch): raise Exception('Couldn\'t process SRPMS repository!') - repodirs_processed.append(repo_id) + repodirs_processed.append(repodir_id) repodirs_processed_cnt = len(repodirs_processed) in_repodirs = ','.join(str(id) for id in repodirs_processed) - repodirs_not_processed = dbc.execute(""" -SELECT id, name, sources, path FROM repodirs rd WHERE id NOT IN (%s) + repodirs_not_processed = conn.execute(""" +SELECT id, name, sources FROM repodirs rd WHERE id NOT IN (%s) """ % in_repodirs).fetchall() if len(repodirs_not_processed) > 0: print 'Repodirs not processed due to dependencies:' for rdna in repodirs_not_processed: print rdna[1] - dbc.execute(""" + conn.execute(""" ANALYZE""") conn.commit() +def main(args): + global NUM_PROCESSES + + if hasattr(os, "sysconf"): + if os.sysconf_names.has_key("SC_NPROCESSORS_ONLN"): + nproc = os.sysconf("SC_NPROCESSORS_ONLN") + if isinstance(nproc, int) and nproc > 0: + NUM_PROCESSES = nproc + + options = parse_args() + + if os.path.exists(DB): + os.unlink(DB) + conn = sqlite3.connect(DB) + if import_repositories(options, conn): + process_repodb(conn) + else: + os.unlink(DB) + + if __name__ == "__main__": main(sys.argv)