repo-analyzer/analyze-repodb.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
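"""Analyze repository metadata collected in an SQLite database (repo.db).

The script prints plain-text reports on broken dependencies, lost sources,
possible repository partitioning, lost object files, invalid object file
links, unresolved .so NEEDED entries and unresolved symbols, and can render
per-repository dependency graphs in Graphviz .dot format (-d/--dot-graphs).
The database is assumed to be produced by the accompanying repo-analyzer
collector; this is inferred from the table names used below.

Usage: analyze-repodb.py [-d]
"""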
import os
import sys
import gettext
import argparse
import sqlite3
import string
import rpm
import re
gettext.install('urpm-tools')
DB = 'repo.db'
def parseargs():
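    """Parse command line options."""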
    parser = argparse.ArgumentParser(
        description=_('analyze repositories metadata from repo.db'))
    parser.add_argument('-d', '--dot-graphs', action='store_true',
                        help=_('visualize dependencies in .DOT graphs'))
    opts = parser.parse_args()
    return opts
def detect_broken_dependencies(dbc):
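    """Report unresolvable dependencies per repository.

    First print packages with requires that have no resolution in
    package_depend_res (bottom level), then iteratively add packages that
    depend on already broken ones and print their dependency chains.
    """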
    def print_broken_packages():
        for rpb_name in sorted(repo_packages_broken.keys()):
            rpb_id = repo_packages_broken[rpb_name]
            dep_chain = []
            dep_id = all_broken[rpb_id]['depid']
            while dep_id != 0:
                dep_chain.append('%s (%d)' % (all_broken[dep_id]['nvra'],
                                              all_broken[dep_id]['repo']))
                dep_id = all_broken[dep_id]['depid']
            print '\t' + rpb_name + ' => ' + (' => '.join(dep_chain))
        print 'Total: %d' % len(repo_packages_broken)
        print ''

    # Detect broken dependencies with recursion
    repodirs_analyzed = []
    broken = {}
    broken_level0 = dbc.execute("""
        SELECT packages.id, nvra, repodir_id, repodirs.name,
            rpm_requires.name AS req_name, rpm_requires.build_arch AS req_arch
        FROM packages, repodirs, rpm_requires
        WHERE packages.repodir_id = repodirs.id AND
            packages.id = rpm_requires.package_id AND
            NOT EXISTS (SELECT 1 FROM package_depend_res pdr
                        WHERE pdr.package_id = packages.id AND
                              pdr.requires_id = rpm_requires.id)
        ORDER BY repodir_id, nvra""").fetchall()
    all_broken = {}
    if len(broken_level0) > 0:
        print 'Broken dependencies (bottom level):'
        bp_reqs = []
        pre_repodir_id = -1
        pre_bp_id = -1
        pre_bp_nvra = -1
        pre_cnt = 0
        for bp in broken_level0:
            (bp_id, bp_nvra, bp_repodir_id, bp_repodir_name, bp_reqname, bp_reqarch) = \
                (bp[0], bp[1], bp[2], bp[3], bp[4], bp[5])
            broken[bp_id] = bp_nvra
            if pre_bp_id != bp_id and pre_bp_id != -1:
                print '\t%s (%s)' % (pre_bp_nvra, ', '.join(bp_reqs))
                pre_cnt += 1
                bp_reqs = []
            if bp_reqarch is not None:
                bp_reqname += ':' + bp_reqarch
            if bp_reqname not in bp_reqs:
                bp_reqs.append(bp_reqname)
            if pre_repodir_id != bp_repodir_id:
                if pre_repodir_id != -1:
                    print 'Total: %d' % pre_cnt
                print '%d) %s' % (bp_repodir_id, bp_repodir_name)
                pre_repodir_id = bp_repodir_id
                pre_cnt = 0
            if bp_id not in all_broken:
                all_broken[bp_id] = {'repo': bp_repodir_id, 'nvra': bp_nvra,
                                     'reqname': bp_reqname, 'depid': 0}
            pre_bp_id = bp_id
            pre_bp_nvra = bp_nvra
        if pre_bp_id != -1:
            print '\t%s (%s)' % (pre_bp_nvra, ', '.join(bp_reqs))
            pre_cnt += 1  # count the last printed package as well
        print 'Total: %d' % pre_cnt
    all_broken_cnt = -1
    broken_recursive = []
    while all_broken_cnt < len(all_broken):
        all_broken_cnt = len(all_broken)
        pids = ','.join(str(id) for id in all_broken.keys())
        if not pids:
            pids = '0'  # avoid an empty IN () list when nothing is broken
        packages_broken_recurs = dbc.execute("""
            SELECT packages.id, nvra, repodir_id, repodirs.name,
                rpm_requires.name AS req_name, build_arch, dep_package_id
            FROM packages, repodirs, rpm_requires, package_depend_res
            WHERE packages.repodir_id = repodirs.id AND
                packages.id = rpm_requires.package_id AND
                packages.id = package_depend_res.package_id AND
                rpm_requires.id = package_depend_res.requires_id AND
                dep_package_id IN (%(pids)s) AND
                packages.id NOT IN (%(pids)s)
            ORDER BY repodir_id, nvra""" % {'pids': pids}).fetchall()
        for packb in packages_broken_recurs:
            all_broken[packb[0]] = {'repo': packb[2], 'nvra': packb[1],
                                    'reqname': packb[4], 'build_arch': packb[5],
                                    'depid': packb[6]}
            broken_recursive.append(packb[0])
    if broken_recursive:
        print 'Recursive broken dependencies:'
        all_repodirs = dbc.execute("""
            SELECT id, name, sources FROM repodirs ORDER BY id""").fetchall()
        for rd in all_repodirs:
            (rd_id, rd_name, rd_sources) = (rd[0], rd[1], rd[2])
            if rd_sources == '.':
                archs = dbc.execute("""
                    SELECT DISTINCT build_arch FROM rpm_requires
                    WHERE package_id IN (SELECT id FROM packages WHERE repodir_id = ?)
                    """, [rd_id]).fetchall()
                for arch_rec in archs:
                    repo_packages_broken = {
                        all_broken[id]['nvra']: id for id in broken_recursive
                        if all_broken[id]['repo'] == rd_id and
                           all_broken[id]['build_arch'] == arch_rec[0]}
                    if repo_packages_broken:
                        print '%d) %s (%s)' % (rd_id, rd_name, arch_rec[0])
                        print_broken_packages()
            else:
                repo_packages_broken = {
                    all_broken[id]['nvra']: id for id in broken_recursive
                    if all_broken[id]['repo'] == rd_id}
                if repo_packages_broken:
                    print '%d) %s' % (rd_id, rd_name)
                    print_broken_packages()
def OutputGraphHead(file_output, dg_name):
"""Output Graph head.
Static information about graph.
"""
file_output.write('\n\ndigraph "%s" {\n' % dg_name + \
'size="20.69,25.52";\nratio="fill";\n' + \
'rankdir="TB";\nnode[style="filled"];\nnode[shape="box"];\n\n')
def OutputGraphTail(file_output):
"""Finish the graph.
"""
file_output.write('}\n')
def render_dot_graphs(dbc):
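    """Render per-repository dependency graphs.

    For every repository a Graphviz file repo-<id>.dot is written with an
    edge from each package to the package that resolves one of its requires
    (according to package_depend_res).
    """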
    repodirs = dbc.execute("""
        SELECT id, name, sources, path FROM repodirs ORDER BY id
        """).fetchall()
    for repodir in repodirs:
        (rd_id, rd_name) = (repodir[0], repodir[1])
        dot_file = open('repo-%d.dot' % rd_id, 'w')
        packages_processed = {}
        low_level_pkgs = dbc.execute("""
            SELECT packages.id, packages.nvra FROM packages
            WHERE repodir_id = ? AND
                NOT EXISTS (SELECT 1 FROM package_depend_res, packages dp
                            WHERE package_id = packages.id AND
                                  dp.id = dep_package_id AND dp.repodir_id = ?)
            ORDER BY packages.id""", [rd_id, rd_id]).fetchall()
        for pkg_rec in low_level_pkgs:
            packages_processed[pkg_rec[0]] = pkg_rec[1]
        OutputGraphHead(dot_file, rd_name)
        pkg_linked = {}
        level = 0
        curr_level_pkgs = [pkg_rec[0] for pkg_rec in low_level_pkgs]
        while len(curr_level_pkgs) > 0:
            in_curr_pkgs = ','.join([str(pkg_id)
                                     for pkg_id in curr_level_pkgs])
            depend_pkgs = dbc.execute("""
                SELECT DISTINCT packages.id, packages.nvra, package_depend_res.dep_package_id
                FROM package_depend_res, packages
                WHERE repodir_id = ? AND package_depend_res.dep_package_id IN (%s)
                    AND package_depend_res.package_id = packages.id
                ORDER BY packages.id""" % in_curr_pkgs, [rd_id]).fetchall()
            next_level_pkgs = []
            for pkg_rec in depend_pkgs:
                if level == 0:
                    pkg_linked[pkg_rec[2]] = True
                if pkg_rec[0] not in packages_processed:
                    packages_processed[pkg_rec[0]] = pkg_rec[1]
                    next_level_pkgs.append(pkg_rec[0])
                if pkg_rec[0] != pkg_rec[2]:
                    dot_file.write('"%s" -> "%s" [color="0.66 1 0.66"];\n' %
                                   (packages_processed[pkg_rec[0]],
                                    packages_processed[pkg_rec[2]]))
            if level == 0:
                for ll_rec in low_level_pkgs:
                    if ll_rec[0] not in pkg_linked:
                        dot_file.write('"%s" [color="0.66 0.66 1"];\n' %
                                       packages_processed[ll_rec[0]])
            curr_level_pkgs = next_level_pkgs
            level += 1
        OutputGraphTail(dot_file)
        dot_file.close()
def detect_loops(dbc):
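    """Report packages that resolve one of their own requires
    (package_depend_res rows with package_id = dep_package_id).
    """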
    header = '===\n' \
             'Loopbacks:'
    repodirs = dbc.execute("""
        SELECT id, name, sources, path FROM repodirs ORDER BY id
        """).fetchall()
    for repodir in repodirs:
        (rd_id, rd_name) = (repodir[0], repodir[1])
        loopbacks = dbc.execute("""
            SELECT p.id, p.nvra, rpm_requires.name
            FROM package_depend_res pdr, packages p, rpm_requires
            WHERE pdr.package_id = p.id AND pdr.package_id = dep_package_id AND
                rpm_requires.id = pdr.requires_id AND p.repodir_id = ?
            ORDER BY p.nvra, rpm_requires.name
            """, [rd_id]).fetchall()
        if loopbacks:
            if header:
                print header
                header = None
            print '%d) %s' % (rd_id, rd_name)
            pre_pkg_id = None
            pre_pkg_name = None
            requires = []
            cnt = 0
            for lb_rec in loopbacks:
                pkg_id = lb_rec[0]
                pkg_name = lb_rec[1]
                if pkg_id != pre_pkg_id:
                    cnt += 1
                    if pre_pkg_id is not None:
                        print '\t%s (%s)' % (pre_pkg_name, ','.join(requires))
                        requires = []
                    pre_pkg_id = pkg_id
                    pre_pkg_name = pkg_name
                requires.append(lb_rec[2])
            if pre_pkg_id is not None:
                print '\t%s (%s)' % (pre_pkg_name, ','.join(requires))
            print 'Total: %d' % cnt
def detect_lost_sources(dbc):
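    """Report binary packages whose source package could not be found
    (sourcerpm is set but sourcerpm_package is NULL).
    """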
    print '==='
    print 'Lost sources:'
    repodirs = dbc.execute("""
        SELECT id, name, sources, path FROM repodirs WHERE sources <> '.' ORDER BY id
        """).fetchall()
    for repodir in repodirs:
        (rd_id, rd_name) = (repodir[0], repodir[1])
        lost_sources = dbc.execute("""
            SELECT name, nvra, sourcerpm FROM packages
            WHERE repodir_id = ? AND
                sourcerpm IS NOT NULL AND sourcerpm_package IS NULL
            ORDER BY name
            """, [rd_id]).fetchall()
        if lost_sources:
            print '%d) %s' % (rd_id, rd_name)
            for ls in lost_sources:
                print '\t%s (%s)' % (ls[1], ls[2])
            print 'Total: %d' % len(lost_sources)
def analyze_partitioning(dbc):
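    """Suggest a partitioning of each repository.

    Starting from an arbitrary package, grow a group by repeatedly adding
    packages that depend on it or that it depends on (package_depend_res in
    both directions) until the group is closed; repeat until all packages
    are assigned.  Groups of a single package are listed under 'Singles'.
    """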
    print '==='
    print 'Possible partitioning:'
    repodirs = dbc.execute("""
        SELECT id, name, sources, path FROM repodirs WHERE sources <> '.' ORDER BY id
        """).fetchall()
    for repodir in repodirs:
        (rd_id, rd_name) = (repodir[0], repodir[1])
        partitions = []
        partitioned_packages = []
        singles = []
        while True:
            ppackages = ','.join(str(id) for id in partitioned_packages)
            if not ppackages:
                ppackages = '0'
            pkg1_rec = dbc.execute("""
                SELECT id, name, nvra
                FROM packages WHERE repodir_id = ? AND id NOT IN (%s)
                ORDER BY name
                LIMIT 1""" % ppackages, [rd_id]).fetchall()
            if not pkg1_rec:
                break
            if not partitioned_packages:
                print '%d) %s' % (rd_id, rd_name)
            (pkg_id, pkg_name) = (pkg1_rec[0][0], pkg1_rec[0][2])
            partition_names = []
            partition_names.append(pkg_name)
            partition_ids = []
            partition_ids.append(pkg_id)
            partitioned_packages.append(pkg_id)
            current_level_packages = [pkg_id]
            while True:
                cl_packages = ','.join(str(id) for id in current_level_packages)
                part_packages = ','.join(str(id) for id in partition_ids)
                upper_packages = dbc.execute("""
                    SELECT packages.id, packages.name, nvra
                    FROM packages, package_depend_res
                    WHERE packages.id = package_depend_res.package_id AND
                        packages.repodir_id = ? AND
                        package_depend_res.dep_package_id IN (%s) AND
                        packages.id NOT IN (%s)
                    ORDER BY packages.name
                    """ % (cl_packages, part_packages), [rd_id]).fetchall()
                lower_packages = dbc.execute("""
                    SELECT packages.id, packages.name, nvra
                    FROM packages, package_depend_res
                    WHERE packages.id = package_depend_res.dep_package_id AND
                        packages.repodir_id = ? AND
                        package_depend_res.package_id IN (%s) AND
                        packages.id NOT IN (%s)
                    ORDER BY packages.name
                    """ % (cl_packages, part_packages), [rd_id]).fetchall()
                if not upper_packages and not lower_packages:
                    break
                current_level_packages = []
                for rec in upper_packages:
                    if rec[0] not in current_level_packages:
                        current_level_packages.append(rec[0])
                        partitioned_packages.append(rec[0])
                        partition_ids.append(rec[0])
                        partition_names.append(rec[2])
                for rec in lower_packages:
                    if rec[0] not in current_level_packages:
                        current_level_packages.append(rec[0])
                        partitioned_packages.append(rec[0])
                        partition_ids.append(rec[0])
                        partition_names.append(rec[2])
            if len(partition_names) == 1:
                singles.append(partition_names[0])
            else:
                for p in sorted(partition_names):
                    print '\t%s' % p
                print 'Total: %d' % len(partition_names)
                print '---'
                print ''
        if len(singles) > 0:
            print 'Singles:'
            for s in sorted(singles):
                print '\t%s' % s
            print 'Total: %d' % len(singles)
def detect_lost_object_files(dbc):
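    """Report package files that are marked 'not-found' in package_files,
    grouped per repository.
    """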
    header = '===\n' \
             'Lost object (executable) files (provided but not found):'
    repodirs = dbc.execute("""
        SELECT id, name, sources, path FROM repodirs ORDER BY id
        """).fetchall()
    for repodir in repodirs:
        (rd_id, rd_name) = (repodir[0], repodir[1])
        lost_object_files = dbc.execute("""
            SELECT nvra, package_files.path, mark
            FROM packages, package_files
            WHERE repodir_id = ? AND packages.id = package_files.package_id AND
                mark = 'not-found'
            ORDER BY packages.name, package_files.path
            """, [rd_id]).fetchall()
        if lost_object_files:
            if header:
                print header
                header = None
            print '%d) %s' % (rd_id, rd_name)
            for lof in lost_object_files:
                print '\t%s: %s' % (lof[0], lof[1])
            print 'Total: %d' % len(lost_object_files)
def detect_broken_object_links(dbc):
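    """Report symbolic links in package_files (mark = 'link') whose target
    path was recorded but could not be matched to any known file.
    """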
    header = '===\n' \
             'Invalid object (executable) file links:'
    repodirs = dbc.execute("""
        SELECT id, name, sources, path FROM repodirs ORDER BY id
        """).fetchall()
    for repodir in repodirs:
        (rd_id, rd_name) = (repodir[0], repodir[1])
        broken_object_links = dbc.execute("""
            SELECT nvra, package_files.path, link_to_path, mark
            FROM packages, package_files
            WHERE repodir_id = ? AND packages.id = package_files.package_id AND
                mark = 'link' AND link_to_path IS NOT NULL AND link_to_file_id IS NULL
            ORDER BY packages.name, package_files.path
            """, [rd_id]).fetchall()
        if broken_object_links:
            if header:
                print header
                header = None
            print '%d) %s' % (rd_id, rd_name)
            for bol in broken_object_links:
                print '\t%s: %s -/-> %s' % (bol[0], bol[1], bol[2])
            print 'Total: %d' % len(broken_object_links)
def get_repodir_depends(dbc, repodir_id):
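    """Return a comma-separated list of repository names that the given
    repository depends on (from repodir_depends).
    """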
    dep_repos = dbc.execute("""
        SELECT depend_repodir_name FROM repodir_depends WHERE repodir_id = ?
        """, [repodir_id]).fetchall()
    return ', '.join([dep_repo[0] for dep_repo in dep_repos])
def detect_so_needed_not_resolved(dbc):
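    """Report how .so NEEDED entries were resolved.

    Print per-repository counts of entries resolved via rpm requires/provides
    (res_type = 1) and via flat file search (res_type = 2), then list the
    entries that were not resolved at all.
    """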
    repodirs = dbc.execute("""
        SELECT id, name, sources, path FROM repodirs ORDER BY id
        """).fetchall()
    print '==='
    print 'Objects needed and resolved by rpm requires-provides:'
    for repodir in repodirs:
        (rd_id, rd_name) = (repodir[0], repodir[1])
        objects_needed_resolved1 = dbc.execute("""
            SELECT COUNT(1)
            FROM packages CROSS JOIN package_files CROSS JOIN so_needed CROSS JOIN so_needed_res
            WHERE repodir_id = ? AND package_files.package_id = packages.id AND
                so_needed.obj_file_id = package_files.id AND
                so_needed_id = so_needed.id AND res_type = 1
            """, [rd_id]).fetchone()
        print '%d) %s: %d' % (rd_id, rd_name, objects_needed_resolved1[0])
    print '==='
    print 'Objects needed and resolved by flat search:'
    for repodir in repodirs:
        (rd_id, rd_name) = (repodir[0], repodir[1])
        objects_needed_resolved2 = dbc.execute("""
            SELECT COUNT(1)
            FROM packages CROSS JOIN package_files CROSS JOIN so_needed CROSS JOIN so_needed_res
            WHERE repodir_id = ? AND package_files.package_id = packages.id AND
                so_needed.obj_file_id = package_files.id AND
                so_needed_id = so_needed.id AND res_type = 2
            """, [rd_id]).fetchone()
        print '%d) %s: %d' % (rd_id, rd_name, objects_needed_resolved2[0])
    header = '===\n' \
             'Objects needed but not resolved:'
    for repodir in repodirs:
        (rd_id, rd_name) = (repodir[0], repodir[1])
        objects_needed_not_resolved = dbc.execute("""
            SELECT packages.nvra, package_files.path, so_needed.name
            FROM packages CROSS JOIN package_files CROSS JOIN so_needed
                LEFT OUTER JOIN so_needed_res ON so_needed_id = so_needed.id
            WHERE repodir_id = ? AND package_files.package_id = packages.id AND
                so_needed.obj_file_id = package_files.id AND so_needed_id IS NULL
            """, [rd_id]).fetchall()
        if objects_needed_not_resolved:
            repodir_depends = get_repodir_depends(dbc, rd_id)
            if header:
                print header
                header = None
            print ('%d) %s' % (rd_id, rd_name)) + \
                ('' if repodir_depends == '' else
                 (' (depends on: %s)' % repodir_depends))
            for obj_nr in objects_needed_not_resolved:
                print '\t%s: %s -?-> %s' % (obj_nr[0], obj_nr[1], obj_nr[2])
            print 'Total: %d' % len(objects_needed_not_resolved)
def detect_symbols_not_resolved(dbc):
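    """Report how object symbols (sym_type = 0) were resolved.

    Print per-repository counts of symbols resolved through the .so NEEDED
    search (res_type 1 and 2) and through flat search (res_type = 3), then
    list the symbols that were not resolved at all.
    """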
    repodirs = dbc.execute("""
        SELECT id, name, sources, path FROM repodirs ORDER BY id
        """).fetchall()
    print '==='
    print 'Symbols resolved by .so NEEDED search:'
    for repodir in repodirs:
        (rd_id, rd_name) = (repodir[0], repodir[1])
        symbols_resolved1_2 = dbc.execute("""
            SELECT COUNT(1)
            FROM packages CROSS JOIN package_files CROSS JOIN obj_symbols CROSS JOIN obj_symbols_res
            WHERE packages.repodir_id = ? AND packages.id = package_files.package_id AND
                package_files.id = obj_symbols.obj_file_id AND sym_type = 0 AND
                obj_symbols_res.obj_sym_id = obj_symbols.id AND res_type IN (1, 2)
            """, [rd_id]).fetchone()
        print '%d) %s: %d' % (rd_id, rd_name, symbols_resolved1_2[0])
    print '==='
    print 'Symbols resolved by flat search:'
    for repodir in repodirs:
        (rd_id, rd_name) = (repodir[0], repodir[1])
        symbols_resolved3 = dbc.execute("""
            SELECT COUNT(1)
            FROM packages CROSS JOIN package_files CROSS JOIN obj_symbols CROSS JOIN obj_symbols_res
            WHERE packages.repodir_id = ? AND packages.id = package_files.package_id AND
                package_files.id = obj_symbols.obj_file_id AND sym_type = 0 AND
                obj_symbols_res.obj_sym_id = obj_symbols.id AND res_type = 3
            """, [rd_id]).fetchone()
        print '%d) %s: %d' % (rd_id, rd_name, symbols_resolved3[0])
    header = '===\n' \
             'Symbols not resolved:'
    for repodir in repodirs:
        (rd_id, rd_name) = (repodir[0], repodir[1])
        symbols_not_resolved = dbc.execute("""
            SELECT packages.nvra, package_files.path, obj_symbols.name
            FROM packages CROSS JOIN package_files CROSS JOIN obj_symbols
            WHERE packages.repodir_id = ? AND packages.id = package_files.package_id AND
                package_files.id = obj_symbols.obj_file_id AND sym_type = 0 AND
                NOT EXISTS (SELECT 1 FROM obj_symbols_res WHERE obj_sym_id = obj_symbols.id)
            """, [rd_id]).fetchall()
        if symbols_not_resolved:
            repodir_depends = get_repodir_depends(dbc, rd_id)
            if header:
                print header
                header = None
            print ('%d) %s' % (rd_id, rd_name)) + \
                ('' if repodir_depends == '' else
                 (' (depends on: %s)' % repodir_depends))
            for sym_nr in symbols_not_resolved:
                print '\t%s: %s -?-> %s' % (sym_nr[0], sym_nr[1], sym_nr[2])
            print 'Total: %d' % len(symbols_not_resolved)
def main(args):
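    """Run all analyses against the repo.db database."""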
    options = parseargs()
    conn = sqlite3.connect(DB)
    dbc = conn.cursor()
    detect_broken_dependencies(dbc)
    if options.dot_graphs:
        render_dot_graphs(dbc)
    #detect_loops(dbc)
    detect_lost_sources(dbc)
    analyze_partitioning(dbc)
    detect_lost_object_files(dbc)
    detect_broken_object_links(dbc)
    detect_so_needed_not_resolved(dbc)
    detect_symbols_not_resolved(dbc)
    conn.close()
if __name__ == "__main__":
    main(sys.argv)