Allow for partial analysis of RPMs

This commit is contained in:
Alexander Lakhin 2014-02-07 13:53:14 +04:00
parent 52059b73d8
commit 3712dda39c
3 changed files with 49 additions and 22 deletions

3
README
View file

@ -3,7 +3,8 @@
1. Настроить структуру репозиториев и пути в repo-analyze-config.xml 1. Настроить структуру репозиториев и пути в repo-analyze-config.xml
2. Заполнить базу данных информацией из репозиториев: 2. Заполнить базу данных информацией из репозиториев:
fill-repodb.py repo-analyze-config.xml fill-repodb.py repo-analyze-config.xml
* Скрипт создаёт в текущем каталоге базу данных repo.db размером около 1 Гб * Для ускорения можно отключить обработку .so и их символов ключами -O -S соответственно.
** Скрипт создаёт в текущем каталоге базу данных repo.db размером около 2 Гб (при полной обработке).
3. Подготовить базу данных к анализу: 3. Подготовить базу данных к анализу:
prepare-repodb.py prepare-repodb.py
4. Выполнить анализ/проверки: 4. Выполнить анализ/проверки:

View file

@ -232,8 +232,8 @@ SELECT packages.id, packages.name, nvra
print 'Total: %d' % len(singles) print 'Total: %d' % len(singles)
def detect_lost_object_files(dbc): def detect_lost_object_files(dbc):
print '===' header = '===\n' \
print 'Lost object (executable) files (provided but not found):' 'Lost object (executable) files (provided but not found):'
repodirs = dbc.execute(""" repodirs = dbc.execute("""
SELECT id, name, sources, path FROM repodirs ORDER BY id SELECT id, name, sources, path FROM repodirs ORDER BY id
""").fetchall() """).fetchall()
@ -246,14 +246,17 @@ SELECT nvra, package_files.path, mark
ORDER BY packages.name, package_files.path ORDER BY packages.name, package_files.path
""", [rd_id]).fetchall() """, [rd_id]).fetchall()
if lost_object_files: if lost_object_files:
if header:
print header
header = None
print '%d) %s' % (rd_id, rd_name) print '%d) %s' % (rd_id, rd_name)
for lof in lost_object_files: for lof in lost_object_files:
print '\t%s: %s' % (lof[0], lof[1]) print '\t%s: %s' % (lof[0], lof[1])
print 'Total: %d' % len(lost_object_files) print 'Total: %d' % len(lost_object_files)
def detect_broken_object_links(dbc): def detect_broken_object_links(dbc):
print '===' header = '===\n' \
print 'Invalid object (executable) file links:' 'Invalid object (executable) file links:'
repodirs = dbc.execute(""" repodirs = dbc.execute("""
SELECT id, name, sources, path FROM repodirs ORDER BY id SELECT id, name, sources, path FROM repodirs ORDER BY id
""").fetchall() """).fetchall()
@ -267,6 +270,9 @@ SELECT nvra, package_files.path, link_to_path, mark
ORDER BY packages.name, package_files.path ORDER BY packages.name, package_files.path
""", [rd_id]).fetchall() """, [rd_id]).fetchall()
if broken_object_links: if broken_object_links:
if header:
print header
header = None
print '%d) %s' % (rd_id, rd_name) print '%d) %s' % (rd_id, rd_name)
for bol in broken_object_links: for bol in broken_object_links:
print '\t%s: %s -/-> %s' % \ print '\t%s: %s -/-> %s' % \
@ -279,7 +285,7 @@ SELECT depend_repodir_name FROM repodir_depends WHERE repodir_id = ?
""", [repodir_id]).fetchall() """, [repodir_id]).fetchall()
return ', '.join([dep_repo[0] for dep_repo in dep_repos]) return ', '.join([dep_repo[0] for dep_repo in dep_repos])
def detect_so_needed_not_found(dbc): def detect_so_needed_not_resolved(dbc):
repodirs = dbc.execute(""" repodirs = dbc.execute("""
SELECT id, name, sources, path FROM repodirs ORDER BY id SELECT id, name, sources, path FROM repodirs ORDER BY id
""").fetchall() """).fetchall()
@ -305,8 +311,8 @@ SELECT COUNT(1) FROM packages CROSS JOIN package_files CROSS JOIN so_needed CROS
""", [rd_id]).fetchone() """, [rd_id]).fetchone()
print '%d) %s: %d' % (rd_id, rd_name, objects_needed_resolved2[0]) print '%d) %s: %d' % (rd_id, rd_name, objects_needed_resolved2[0])
print '===' header = '===' \
print 'Objects needed but not resolved:' 'Objects needed but not resolved:'
for repodir in repodirs: for repodir in repodirs:
(rd_id, rd_name) = (repodir[0], repodir[1]) (rd_id, rd_name) = (repodir[0], repodir[1])
objects_needed_not_resolved = dbc.execute(""" objects_needed_not_resolved = dbc.execute("""
@ -318,6 +324,9 @@ SELECT packages.nvra, package_files.path, so_needed.name
""", [rd_id]).fetchall() """, [rd_id]).fetchall()
if objects_needed_not_resolved: if objects_needed_not_resolved:
repodir_depends = get_repodir_depends(dbc, rd_id) repodir_depends = get_repodir_depends(dbc, rd_id)
if header:
print header
header = None
print ('%d) %s' % (rd_id, rd_name)) + \ print ('%d) %s' % (rd_id, rd_name)) + \
('' if repodir_depends == '' else ('' if repodir_depends == '' else
(' (depends on: %s)' % repodir_depends)) (' (depends on: %s)' % repodir_depends))
@ -325,7 +334,7 @@ SELECT packages.nvra, package_files.path, so_needed.name
print '\t%s: %s -?-> %s' % (obj_nr[0], obj_nr[1], obj_nr[2]) print '\t%s: %s -?-> %s' % (obj_nr[0], obj_nr[1], obj_nr[2])
print 'Total: %d' % len(objects_needed_not_resolved) print 'Total: %d' % len(objects_needed_not_resolved)
def detect_symbols_not_found(dbc): def detect_symbols_not_resolved(dbc):
repodirs = dbc.execute(""" repodirs = dbc.execute("""
SELECT id, name, sources, path FROM repodirs ORDER BY id SELECT id, name, sources, path FROM repodirs ORDER BY id
""").fetchall() """).fetchall()
@ -355,8 +364,8 @@ SELECT COUNT(1) FROM packages CROSS JOIN package_files CROSS JOIN obj_symbols CR
""", [rd_id]).fetchone() """, [rd_id]).fetchone()
print '%d) %s: %d' % (rd_id, rd_name, symbols_resolved3[0]) print '%d) %s: %d' % (rd_id, rd_name, symbols_resolved3[0])
print '===' header = '===' \
print 'Symbols not resolved:' 'Symbols not resolved:'
for repodir in repodirs: for repodir in repodirs:
(rd_id, rd_name) = (repodir[0], repodir[1]) (rd_id, rd_name) = (repodir[0], repodir[1])
symbols_not_resolved = dbc.execute(""" symbols_not_resolved = dbc.execute("""
@ -368,6 +377,9 @@ SELECT packages.nvra, package_files.path, obj_symbols.name
""", [rd_id]).fetchall() """, [rd_id]).fetchall()
if symbols_not_resolved: if symbols_not_resolved:
repodir_depends = get_repodir_depends(dbc, rd_id) repodir_depends = get_repodir_depends(dbc, rd_id)
if header:
print header
header = None
print ('%d) %s' % (rd_id, rd_name)) + \ print ('%d) %s' % (rd_id, rd_name)) + \
('' if repodir_depends == '' else ('' if repodir_depends == '' else
(' (depends on: %s)' % repodir_depends)) (' (depends on: %s)' % repodir_depends))
@ -384,8 +396,8 @@ def main(args):
analyze_partitioning(dbc) analyze_partitioning(dbc)
detect_lost_object_files(dbc) detect_lost_object_files(dbc)
detect_broken_object_links(dbc) detect_broken_object_links(dbc)
detect_so_needed_not_found(dbc) detect_so_needed_not_resolved(dbc)
detect_symbols_not_found(dbc) detect_symbols_not_resolved(dbc)
conn.close() conn.close()
if __name__ == "__main__": if __name__ == "__main__":

View file

@ -40,9 +40,14 @@ def getFileList(path, ext, filelist):
return filelist return filelist
def parseargs(args): def parseargs(args):
parser = argparse.ArgumentParser(description=_('extract packages metadata from RPM repositories')) parser = argparse.ArgumentParser(description=_('extract packages metadata'
parser.add_argument("config", metavar="config", ' from RPM repositories'))
parser.add_argument('config', metavar='config',
help=_('path to repo-analyze-config.xml')) help=_('path to repo-analyze-config.xml'))
parser.add_argument('-O', '--no-shared-objects', action='store_true',
help=_('don\'t process shared objects'))
parser.add_argument('-S', '--no-so-symbols', action='store_true',
help=_('don\'t process shared object symbols'))
opts = parser.parse_args() opts = parser.parse_args()
return opts return opts
@ -169,7 +174,7 @@ FILE_REC_PATH_IDX = 3
FILE_REC_LINK_IDX = 6 FILE_REC_LINK_IDX = 6
FILE_REC_MARK_IDX = 7 FILE_REC_MARK_IDX = 7
def register_object(data, pkg_id, pkg, object_file_record, temp_dir): def register_object(data, pkg_id, pkg, object_file_record, temp_dir, no_so_symbols):
so_needed = data['so_needed'] so_needed = data['so_needed']
obj_symbols = data['obj_symbols'] obj_symbols = data['obj_symbols']
obj_id = object_file_record[0] obj_id = object_file_record[0]
@ -195,7 +200,7 @@ def register_object(data, pkg_id, pkg, object_file_record, temp_dir):
od_out = p.communicate()[0] od_out = p.communicate()[0]
if p.returncode != 0: if p.returncode != 0:
file_mark = 'invalid-format' file_mark = 'invalid-format'
else: elif not(no_so_symbols):
p = subprocess.Popen(['nm', '-p', '-D', '--undefined-only', p = subprocess.Popen(['nm', '-p', '-D', '--undefined-only',
temp_obj_file], temp_obj_file],
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
@ -268,7 +273,9 @@ def extract_files(pkg, files_list, obj_so_files_idx, temp_dir):
return False return False
return True return True
def process_package_worker(num, queue_in, generator, gen_lock, db_struct, repodir_id, build_archs, temp_dir): def process_package_worker(num, queue_in, generator, gen_lock, db_struct,
repodir_id, build_archs, temp_dir,
no_shared_objects, no_so_symbols):
rpm_ts = rpm.TransactionSet() rpm_ts = rpm.TransactionSet()
rpm_ts.setVSFlags(~(rpm.RPMVSF_NEEDPAYLOAD)) rpm_ts.setVSFlags(~(rpm.RPMVSF_NEEDPAYLOAD))
@ -340,6 +347,8 @@ def process_package_worker(num, queue_in, generator, gen_lock, db_struct, repodi
dir_name = os.path.dirname(file_path) dir_name = os.path.dirname(file_path)
if dir_name not in files_dirs: if dir_name not in files_dirs:
files_dirs[dir_name] = True files_dirs[dir_name] = True
if no_shared_objects:
continue
if os.path.splitext(file_name)[1] in \ if os.path.splitext(file_name)[1] in \
['.debug', '.xz', '.conf', '.py', '.c', '.h', '.hpp', '.png', ['.debug', '.xz', '.conf', '.py', '.c', '.h', '.hpp', '.png',
'.cc', '.cpp', '.sh', '.java', '.pl', '.patch', '.desktop']: '.cc', '.cpp', '.sh', '.java', '.pl', '.patch', '.desktop']:
@ -370,7 +379,8 @@ def process_package_worker(num, queue_in, generator, gen_lock, db_struct, repodi
os.makedirs(pkg_temp_dir) os.makedirs(pkg_temp_dir)
if extract_files(pkg, files_list, obj_so_files_idx, pkg_temp_dir): if extract_files(pkg, files_list, obj_so_files_idx, pkg_temp_dir):
for i in obj_so_files_idx: for i in obj_so_files_idx:
register_object(data, pkg_id, pkg, files_list[i], pkg_temp_dir) register_object(data, pkg_id, pkg, files_list[i],
pkg_temp_dir, no_so_symbols)
shutil.rmtree(pkg_temp_dir, True) shutil.rmtree(pkg_temp_dir, True)
@ -405,7 +415,8 @@ INSERT INTO obj_symbols(obj_file_id, name, sym_type) VALUES(?, ?, ?)
generator_value = 0 generator_value = 0
def process_repodir(repodir_path, repodir_id, build_archs, conn, db_struct, tempdir): def process_repodir(repodir_path, repodir_id, build_archs, conn, db_struct,
tempdir, no_shared_objects, no_so_symbols):
rpm_list = [] rpm_list = []
rpm_list = getFileList(repodir_path, '.rpm', rpm_list) rpm_list = getFileList(repodir_path, '.rpm', rpm_list)
@ -506,7 +517,8 @@ CREATE TABLE IF NOT EXISTS %s (id INTEGER PRIMARY KEY NOT NULL,
for i in xrange(NUM_PROCESSES): for i in xrange(NUM_PROCESSES):
worker = mp.Process(target = process_package_worker, worker = mp.Process(target = process_package_worker,
args = (i, queue_in, id_generator, generator_lock, db_struct, args = (i, queue_in, id_generator, generator_lock, db_struct,
repodir_id, build_archs, tempdir)) repodir_id, build_archs, tempdir,
no_shared_objects, no_so_symbols))
workers.append(worker) workers.append(worker)
worker.start() worker.start()
queue_in.join() queue_in.join()
@ -543,7 +555,9 @@ def main(args):
build_archs = [None] if xrepodir.get('sources') != '.' else \ build_archs = [None] if xrepodir.get('sources') != '.' else \
get_build_archs(xrepodir, get_build_archs(xrepodir,
config_root.find('repositories')) config_root.find('repositories'))
process_repodir(xrepodir.get('path'), repodir_id, build_archs, conn, rpm_db_struct, tempdir) process_repodir(xrepodir.get('path'), repodir_id, build_archs, conn,
rpm_db_struct, tempdir, options.no_shared_objects,
options.no_so_symbols)
shutil.rmtree(tempdir, True) shutil.rmtree(tempdir, True)
index_database(conn) index_database(conn)