From 3712dda39cca401c93281ced76ae6a510b68b1ec Mon Sep 17 00:00:00 2001
From: Alexander Lakhin
Date: Fri, 7 Feb 2014 13:53:14 +0400
Subject: [PATCH] Allow for partial RPM analysis

---
 README            |  3 ++-
 analyze-repodb.py | 36 ++++++++++++++++++++++++------------
 fill-repodb.py    | 32 +++++++++++++++++++++++---------
 3 files changed, 49 insertions(+), 22 deletions(-)

diff --git a/README b/README
index ada8c79..5b31b5a 100644
--- a/README
+++ b/README
@@ -3,7 +3,8 @@
 1. Настроить структуру репозиториев и пути в repo-analyze-config.xml
 2. Заполнить базу данных информацией из репозиториев:
    fill-repodb.py repo-analyze-config.xml
-* Скрипт создаёт в текущем каталоге базу данных repo.db размером около 1 Гб
+* Для ускорения можно отключить обработку .so и их символов ключами -O и -S соответственно.
+** Скрипт создаёт в текущем каталоге базу данных repo.db размером около 2 Гб (при полной обработке).
 3. Подготовить базу данных к анализу:
    prepare-repodb.py
 4. Выполнить анализ/проверки:
diff --git a/analyze-repodb.py b/analyze-repodb.py
index 4fc892c..ebdb8ad 100755
--- a/analyze-repodb.py
+++ b/analyze-repodb.py
@@ -232,8 +232,8 @@ SELECT packages.id, packages.name, nvra
     print 'Total: %d' % len(singles)
 
 def detect_lost_object_files(dbc):
-    print '==='
-    print 'Lost object (executable) files (provided but not found):'
+    header = '===\n' \
+             'Lost object (executable) files (provided but not found):'
     repodirs = dbc.execute("""
 SELECT id, name, sources, path FROM repodirs ORDER BY id
 """).fetchall()
@@ -246,14 +246,17 @@ SELECT nvra, package_files.path, mark
   ORDER BY packages.name, package_files.path
 """, [rd_id]).fetchall()
         if lost_object_files:
+            if header:
+                print header
+                header = None
             print '%d) %s' % (rd_id, rd_name)
             for lof in lost_object_files:
                 print '\t%s: %s' % (lof[0], lof[1])
             print 'Total: %d' % len(lost_object_files)
 
 def detect_broken_object_links(dbc):
-    print '==='
-    print 'Invalid object (executable) file links:'
+    header = '===\n' \
+             'Invalid object (executable) file links:'
     repodirs = dbc.execute("""
 SELECT id, name, sources, path FROM repodirs ORDER BY id
 """).fetchall()
@@ -267,6 +270,9 @@ SELECT nvra, package_files.path, link_to_path, mark
   ORDER BY packages.name, package_files.path
 """, [rd_id]).fetchall()
         if broken_object_links:
+            if header:
+                print header
+                header = None
             print '%d) %s' % (rd_id, rd_name)
             for bol in broken_object_links:
                 print '\t%s: %s -/-> %s' % \
@@ -279,7 +285,7 @@ SELECT depend_repodir_name FROM repodir_depends WHERE repodir_id = ?
 """, [repodir_id]).fetchall()
     return ', '.join([dep_repo[0] for dep_repo in dep_repos])
 
-def detect_so_needed_not_found(dbc):
+def detect_so_needed_not_resolved(dbc):
     repodirs = dbc.execute("""
 SELECT id, name, sources, path FROM repodirs ORDER BY id
 """).fetchall()
@@ -305,8 +311,8 @@ SELECT COUNT(1) FROM packages CROSS JOIN package_files CROSS JOIN so_needed CROS
 """, [rd_id]).fetchone()
         print '%d) %s: %d' % (rd_id, rd_name, objects_needed_resolved2[0])
 
-    print '==='
-    print 'Objects needed but not resolved:'
+    header = '===\n' \
+             'Objects needed but not resolved:'
     for repodir in repodirs:
         (rd_id, rd_name) = (repodir[0], repodir[1])
         objects_needed_not_resolved = dbc.execute("""
@@ -318,6 +324,9 @@ SELECT packages.nvra, package_files.path, so_needed.name
 """, [rd_id]).fetchall()
         if objects_needed_not_resolved:
             repodir_depends = get_repodir_depends(dbc, rd_id)
+            if header:
+                print header
+                header = None
             print ('%d) %s' % (rd_id, rd_name)) + \
                 ('' if repodir_depends == '' else
                  (' (depends on: %s)' % repodir_depends))
@@ -325,7 +334,7 @@ SELECT packages.nvra, package_files.path, so_needed.name
                 print '\t%s: %s -?-> %s' % (obj_nr[0], obj_nr[1], obj_nr[2])
             print 'Total: %d' % len(objects_needed_not_resolved)
 
-def detect_symbols_not_found(dbc):
+def detect_symbols_not_resolved(dbc):
     repodirs = dbc.execute("""
 SELECT id, name, sources, path FROM repodirs ORDER BY id
 """).fetchall()
@@ -355,8 +364,8 @@ SELECT COUNT(1) FROM packages CROSS JOIN package_files CROSS JOIN obj_symbols CR
 """, [rd_id]).fetchone()
         print '%d) %s: %d' % (rd_id, rd_name, symbols_resolved3[0])
 
-    print '==='
-    print 'Symbols not resolved:'
+    header = '===\n' \
+             'Symbols not resolved:'
     for repodir in repodirs:
         (rd_id, rd_name) = (repodir[0], repodir[1])
         symbols_not_resolved = dbc.execute("""
@@ -368,6 +377,9 @@ SELECT packages.nvra, package_files.path, obj_symbols.name
 """, [rd_id]).fetchall()
         if symbols_not_resolved:
             repodir_depends = get_repodir_depends(dbc, rd_id)
+            if header:
+                print header
+                header = None
             print ('%d) %s' % (rd_id, rd_name)) + \
                 ('' if repodir_depends == '' else
                  (' (depends on: %s)' % repodir_depends))
@@ -384,8 +396,8 @@ def main(args):
     analyze_partitioning(dbc)
     detect_lost_object_files(dbc)
     detect_broken_object_links(dbc)
-    detect_so_needed_not_found(dbc)
-    detect_symbols_not_found(dbc)
+    detect_so_needed_not_resolved(dbc)
+    detect_symbols_not_resolved(dbc)
     conn.close()
 
 if __name__ == "__main__":
diff --git a/fill-repodb.py b/fill-repodb.py
index ac357d5..2217615 100755
--- a/fill-repodb.py
+++ b/fill-repodb.py
@@ -40,9 +40,14 @@ def getFileList(path, ext, filelist):
     return filelist
 
 def parseargs(args):
-    parser = argparse.ArgumentParser(description=_('extract packages metadata from RPM repositories'))
-    parser.add_argument("config", metavar="config",
+    parser = argparse.ArgumentParser(description=_('extract packages metadata'
+                                                   ' from RPM repositories'))
+    parser.add_argument('config', metavar='config',
                         help=_('path to repo-analyze-config.xml'))
+    parser.add_argument('-O', '--no-shared-objects', action='store_true',
+                        help=_('don\'t process shared objects'))
+    parser.add_argument('-S', '--no-so-symbols', action='store_true',
+                        help=_('don\'t process shared object symbols'))
     opts = parser.parse_args()
     return opts
 
@@ -169,7 +174,7 @@ FILE_REC_PATH_IDX = 3
 FILE_REC_LINK_IDX = 6
 FILE_REC_MARK_IDX = 7
 
-def register_object(data, pkg_id, pkg, object_file_record, temp_dir):
+def register_object(data, pkg_id, pkg, object_file_record, temp_dir, no_so_symbols):
     so_needed = data['so_needed']
     obj_symbols = data['obj_symbols']
     obj_id = object_file_record[0]
@@ -195,7 +200,7 @@ def register_object(data, pkg_id, pkg, object_file_record, temp_dir):
     od_out = p.communicate()[0]
     if p.returncode != 0:
         file_mark = 'invalid-format'
-    else:
+    elif not(no_so_symbols):
         p = subprocess.Popen(['nm', '-p', '-D', '--undefined-only',
                               temp_obj_file],
                              stdout=subprocess.PIPE,
@@ -268,7 +273,9 @@ def extract_files(pkg, files_list, obj_so_files_idx, temp_dir):
             return False
     return True
 
-def process_package_worker(num, queue_in, generator, gen_lock, db_struct, repodir_id, build_archs, temp_dir):
+def process_package_worker(num, queue_in, generator, gen_lock, db_struct,
+                           repodir_id, build_archs, temp_dir,
+                           no_shared_objects, no_so_symbols):
     rpm_ts = rpm.TransactionSet()
     rpm_ts.setVSFlags(~(rpm.RPMVSF_NEEDPAYLOAD))
 
@@ -340,6 +347,8 @@ def process_package_worker(num, queue_in, generator, gen_lock, db_struct, repodi
             dir_name = os.path.dirname(file_path)
             if dir_name not in files_dirs:
                 files_dirs[dir_name] = True
+            if no_shared_objects:
+                continue
             if os.path.splitext(file_name)[1] in \
                 ['.debug', '.xz', '.conf', '.py', '.c', '.h', '.hpp', '.png',
                  '.cc', '.cpp', '.sh', '.java', '.pl', '.patch', '.desktop']:
@@ -370,7 +379,8 @@ def process_package_worker(num, queue_in, generator, gen_lock, db_struct, repodi
                 os.makedirs(pkg_temp_dir)
             if extract_files(pkg, files_list, obj_so_files_idx, pkg_temp_dir):
                 for i in obj_so_files_idx:
-                    register_object(data, pkg_id, pkg, files_list[i], pkg_temp_dir)
+                    register_object(data, pkg_id, pkg, files_list[i],
+                                    pkg_temp_dir, no_so_symbols)
             shutil.rmtree(pkg_temp_dir, True)
 
 
@@ -405,7 +415,8 @@ INSERT INTO obj_symbols(obj_file_id, name, sym_type) VALUES(?, ?, ?)
 generator_value = 0
 
 
-def process_repodir(repodir_path, repodir_id, build_archs, conn, db_struct, tempdir):
+def process_repodir(repodir_path, repodir_id, build_archs, conn, db_struct,
+                    tempdir, no_shared_objects, no_so_symbols):
     rpm_list = []
     rpm_list = getFileList(repodir_path, '.rpm', rpm_list)
 
@@ -506,7 +517,8 @@ CREATE TABLE IF NOT EXISTS %s (id INTEGER PRIMARY KEY NOT NULL,
     for i in xrange(NUM_PROCESSES):
         worker = mp.Process(target = process_package_worker,
                             args = (i, queue_in, id_generator, generator_lock, db_struct,
-                                    repodir_id, build_archs, tempdir))
+                                    repodir_id, build_archs, tempdir,
+                                    no_shared_objects, no_so_symbols))
         workers.append(worker)
         worker.start()
     queue_in.join()
@@ -543,7 +555,9 @@ def main(args):
 
         build_archs = [None] if xrepodir.get('sources') != '.' else \
             get_build_archs(xrepodir, config_root.find('repositories'))
-        process_repodir(xrepodir.get('path'), repodir_id, build_archs, conn, rpm_db_struct, tempdir)
+        process_repodir(xrepodir.get('path'), repodir_id, build_archs, conn,
+                        rpm_db_struct, tempdir, options.no_shared_objects,
+                        options.no_so_symbols)
 
     shutil.rmtree(tempdir, True)
     index_database(conn)
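
Usage note (not part of the patch; a minimal sketch based on the README and the
options added above): the new fill-repodb.py switches trade completeness of the
collected data for speed and a smaller repo.db.

    # full run: shared objects and their nm symbols are recorded (~2 GB repo.db)
    fill-repodb.py repo-analyze-config.xml

    # -O / --no-shared-objects: skip shared-object processing entirely
    fill-repodb.py -O repo-analyze-config.xml

    # -S / --no-so-symbols: keep shared-object records, skip nm symbol extraction
    fill-repodb.py -S repo-analyze-config.xml

With -O or -S the later prepare-repodb.py/analyze-repodb.py steps should still
run, but the detect_so_needed_not_resolved()/detect_symbols_not_resolved()
reports will have correspondingly less (or no) so_needed/obj_symbols data to
check against.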