repo-analyzer/prepare-repodb.py
2014-02-13 12:03:20 +04:00

419 lines
19 KiB
Python
Executable file

#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import sys
import gettext
import argparse
import sqlite3
import re
from versutils import *
DB = 'repo.db'
def process_repodir_requires(dbc, repodir_id, repodir_name, repodir_depends, requires_build_arch):
    """Resolve rpm 'Requires' for every package of one repository.

    For each requirement of each package in repodir ``repodir_id``, search
    a satisfying 'Provides' (or owning package, for file dependencies)
    among the packages of this repodir plus ``repodir_depends``, and insert
    one row per resolution into package_depend_res.  Unresolved
    requirements are printed and counted as broken dependencies.

    dbc                 -- sqlite3 cursor on the repository database
    repodir_id          -- id of the repodir being processed
    repodir_name        -- repodir name (logging only)
    repodir_depends     -- ids of repodirs searched for providers
    requires_build_arch -- when not None, restrict to requires rows with
                           this build_arch (used for SRPMS repodirs)
    """
    global n  # running counter of processed requires, initialized in main()
    print 'Processing repo %d: %s (with depends: %s)' % (repodir_id, repodir_name, str(repodir_depends))
    # All requires of all packages in this repodir.  The optional
    # build_arch predicate is inlined because a '?' parameter cannot stand
    # for a whole optional clause.
    package_requires = dbc.execute("""
        SELECT packages.id AS package_id, packages.name AS package_name, packages.nvra,
            req.id, req.name, flags, req.version
        FROM packages, rpm_requires req
        WHERE repodir_id = ? AND req.package_id = packages.id %s
        ORDER BY packages.name, req.name
        """ % ((" AND build_arch = '%s'" % requires_build_arch)
               if requires_build_arch is not None else ""),
        [repodir_id]).fetchall()
    # Providers may live in this repodir or in any repodir it depends on.
    search_repodirs = [repodir_id]
    search_repodirs.extend(repodir_depends)
    in_repodirs = ','.join(str(id) for id in search_repodirs)
    # print 'package requires count: ', len(package_requires)
    broken_dep = 0
    # Maps "name\0flags\0version" -> list of resolution dicts so identical
    # requirements (shared by many packages) are resolved only once.
    requires_cache = {}
    # TODO: Reuse the cache for dependent repositories???
    for packreq in package_requires:
        (cpackage_id, package_nvra, requires_id, requires_name, requires_flags, requires_version) = \
            (packreq[0], packreq[2], packreq[3], packreq[4], packreq[5], packreq[6])
        requirement_uid = requires_name + '\0' + str(requires_flags) + '\0' + requires_version
        dep_res = requires_cache.get(requirement_uid, None)
        if dep_res is None:
            dep_res = []
            if (re.match(r'\A(rpmlib|executable)\(.+\)\Z', requires_name)):
                # rpmlib()/executable() capabilities are satisfied by rpm
                # itself, not by a package: record an empty (NULL) resolution.
                # see if($N=~/\A(rpmlib|executable)\(.+\)\Z/) in urpm_repoclosure.pl
                dep_res.append({})
            else:
                # All provides with the same capability name in the
                # searched repodirs.
                depend_candidates = dbc.execute("""
                    SELECT packages.id AS package_id, packages.name AS package_name, packages.nvra,
                        prov.id, prov.name, flags, prov.version
                    FROM packages, rpm_provides AS prov
                    WHERE prov.package_id = packages.id AND repodir_id IN (%s) AND prov.name = ?
                    ORDER by packages.name, packages.nvra
                    """ % in_repodirs, [requires_name]).fetchall()
                # First pass: choose the best provided version that
                # satisfies the required version range ('*' = unversioned
                # provide, always acceptable and preferred).
                preferred_version = None
                for dep_cand in depend_candidates:
                    (provides_flags, provides_version) = \
                        (dep_cand[5], dep_cand[6])
                    #print 'provides_version: ', provides_flags, ' ', provides_version
                    if provides_flags & RPMSENSE_SENSEMASK == 0:
                        # No comparison flags => the provide must be
                        # unversioned; represent it with the wildcard '*'.
                        if not provides_version:
                            provides_version = '*'
                        else:
                            raise Exception('Invalid provides version '
                                            '(flags = %d, version = %s)!' %
                                            (provides_flags, provides_version))
                    if version_ok(requires_version,
                                  requires_flags & RPMSENSE_SENSEMASK,
                                  provides_version):
                        better_version = preferred_version is None or \
                            provides_version == '*'
                        if not better_version:
                            better_version = version_ok(provides_version,
                                                        RPMSENSE_GREATER,
                                                        preferred_version)
                        if better_version:
                            preferred_version = provides_version
                # Second pass: record every candidate whose version equals
                # the preferred one (several packages may tie).
                if preferred_version is not None:
                    for dep_cand in depend_candidates:
                        (pkg_id, provides_id, provides_version) = \
                            (dep_cand[0], dep_cand[3], dep_cand[6])
                        if provides_version == preferred_version or \
                                version_ok(provides_version, RPMSENSE_EQUAL,
                                           preferred_version):
                            dep_res.append({'prov_id': provides_id,
                                            'pkg_id': pkg_id})
            if len(dep_res) == 0 and requires_name.startswith('/'): # file dependency
                if (requires_flags & (RPMSENSE_SCRIPT_POST |
                                      RPMSENSE_SCRIPT_PREUN |
                                      RPMSENSE_SCRIPT_POSTUN)) != 0:
                    # Scriptlet-time file dependency: satisfied if the
                    # package itself ships the file.
                    # NOTE(review): this branch depends on cpackage_id but
                    # the result is cached by requirement only, so a later
                    # package with the same scriptlet file dependency
                    # reuses this package's answer -- verify intent.
                    int_files_cnt = dbc.execute("""
                        SELECT COUNT(1) FROM package_files WHERE package_id = ? AND path = ?
                        """, [cpackage_id, requires_name]).fetchone()
                    if int_files_cnt[0] > 0:
                        dep_res.append({})
                else:
                    #TODO: Check file dependencies (/usr/bin/python (required by ant-scripts-1.7.1-7.0.6.noarch), /usr/sbin/useradd (required by tomcat5-5.5.28-0.5.2.noarch))?
                    # Resolve against any package in the searched repodirs
                    # that ships the required path.
                    files_deps = dbc.execute("""
                        SELECT package_id FROM package_files
                        WHERE path = ? AND
                            package_id in (SELECT id FROM packages WHERE repodir_id IN (%s))
                        """ % in_repodirs, [requires_name]).fetchall()
                    for file_dep in files_deps:
                        dep_res.append({'pkg_id': file_dep[0]})
            if len(dep_res) == 0 and (requires_flags & RPMSENSE_MISSINGOK) != 0:
                # Weak (missingok) dependency: treat as satisfied-empty.
                dep_res.append({})
        if len(dep_res) > 0:
            # One row per provider; empty dicts produce NULL provides_id /
            # dep_package_id (requirement satisfied without a provider).
            for res_rec in dep_res:
                dbc.execute("""
                    INSERT INTO package_depend_res(package_id, requires_id,
                        provides_id, dep_package_id)
                    VALUES (?, ?, ?, ?)
                    """, [cpackage_id, requires_id, res_rec.get('prov_id'), res_rec.get('pkg_id')])
        else:
            print requires_name, ' ', requires_version, ' (required by %s)' % package_nvra, ' not found!!!'
            broken_dep += 1
        requires_cache[requirement_uid] = dep_res
        n = n + 1
        #print "n = ", n
        # if n == 60000:
        # break
    print 'broken_deps: ', broken_dep
    print ''
def extract_arch(arch_template, repo_name):
    """Extract the architecture substring from a repository name.

    ``arch_template`` is a repodir name where the literal token '$arch'
    marks the architecture position (e.g. 'media/$arch/main').  Returns
    the part of ``repo_name`` occupying that position when the literal
    parts around '$arch' match, otherwise None (also when the template
    contains no '$arch' token).
    """
    prefix, marker, suffix = arch_template.partition('$arch')
    if not marker:
        # Template has no '$arch' placeholder at all.
        return None
    if not (repo_name.startswith(prefix) and repo_name.endswith(suffix)):
        return None
    return repo_name[len(prefix):len(repo_name) - len(suffix)]
def process_repodir_file_links(dbc, repodir_id, repodir_name, repodir_depends):
    """Resolve symlink file records to the file rows they point at.

    For every package_files row in repodir ``repodir_id`` with
    link_to_path set, follow the chain of symlinks - searching first
    inside the owning package, then inside its resolved dependency
    packages (two levels via package_depend_res) - and store the final
    target row's id in package_files.link_to_file_id.  Link loops are
    reported and left unresolved.

    NOTE(review): repodir_depends is unused here; targets are looked up
    through package_depend_res instead.
    """
    # All symlink file records of packages in this repodir.
    package_files_links = dbc.execute("""
        SELECT packages.id AS package_id, packages.name AS package_name, packages.nvra,
            package_files.id AS object_id, package_files.path, package_files.link_to_path
        FROM packages, package_files
        WHERE repodir_id = ? AND package_files.package_id = packages.id AND
            link_to_path IS NOT NULL
        ORDER BY packages.name, link_to_path
        """, [repodir_id]).fetchall()
    for file_link in package_files_links:
        pkg_id = file_link[0]
        pkg_name = file_link[1]
        pkg_nvra = file_link[2]
        object_id = file_link[3]
        target_obj_id = None
        target_path = os.path.normpath(file_link[5])
        # Paths already visited along this chain, for loop detection.
        target_paths = {}
        target_paths[target_path] = True
        while target_path != '':
            new_target_path = None
            # Try the link target inside the same package first.
            tofile = dbc.execute("""
                SELECT id, link_to_path FROM package_files WHERE path = ? AND package_id = ?
                """, [target_path, pkg_id]).fetchone()
            if tofile:
                target_obj_id = tofile[0]
                new_target_path = tofile[1]
            if not target_obj_id:
                # Just two levels of dependency recursion - TODO: Full depth recursion?
                tofile = dbc.execute("""
                    SELECT id, link_to_path FROM package_files WHERE path = ? AND package_id IN (
                        SELECT dep_package_id FROM package_depend_res WHERE package_id = ?
                        UNION
                        SELECT dep_package_id FROM package_depend_res WHERE package_id IN
                            (SELECT dep_package_id FROM package_depend_res WHERE package_id = ?)
                    )
                    """, [target_path, pkg_id, pkg_id]).fetchone()
                if tofile:
                    target_obj_id = tofile[0]
                    new_target_path = tofile[1]
            if new_target_path is None:
                # Target is a regular file (or not found): chain ends here.
                break
            # Target is itself a symlink: keep following it.
            target_path = os.path.normpath(new_target_path)
            if target_path in target_paths:
                print 'Link loop detected! %s: %s -> %s' % (pkg_nvra, file_link[5], target_path)
                target_obj_id = None
                break
            target_paths[target_path] = True
        if target_obj_id:
            dbc.execute("""
                UPDATE package_files SET link_to_file_id = ? WHERE id = ?
                """, [target_obj_id, object_id])
        else:
            # print 'target %s not found (%d: %s)' % (target_path, pkg_id, pkg_name)
            pass
def process_repodir_so_needed(dbc, repodir_id, repodir_name, repodir_depends):
    """Resolve shared-library (so_needed) entries of object files.

    Two passes insert into so_needed_res:
      res_type 1 - follow already-resolved rpm dependencies: the needed
        soname must match an rpm requires name (optionally with the
        '()(64bit)' suffix) and the dependency package must ship a file
        with that basename;
      res_type 2 - fallback for entries still unresolved: match by
        basename against any package in this repodir or its dependency
        repodirs.
    Prints how many entries remain unresolved after each pass.
    """
    print 'Searching object files resolutions (1)...'
    # Pass 1: resolve through package_depend_res rows created by
    # process_repodir_requires().
    dbc.execute("""
        INSERT INTO so_needed_res(so_needed_id, dep_obj_file_id, res_type)
        SELECT so_needed.id, tpf.id, 1 FROM packages
            CROSS JOIN package_files spf CROSS JOIN so_needed CROSS JOIN rpm_requires
            CROSS JOIN package_depend_res dep_res CROSS JOIN package_files tpf
        WHERE so_needed.obj_file_id = spf.id AND spf.package_id = packages.id AND
            packages.repodir_id = ? AND spf.package_id = rpm_requires.package_id AND
            (so_needed.name = rpm_requires.name OR
            so_needed.name || '()(64bit)' = rpm_requires.name) AND
            packages.id = dep_res.package_id AND
            rpm_requires.id = dep_res.requires_id AND
            dep_res.dep_package_id = tpf.package_id AND
            so_needed.name = tpf.basename
        """, [repodir_id])
    search_repodirs = [repodir_id]
    search_repodirs.extend(repodir_depends)
    in_repodirs = ','.join(str(id) for id in search_repodirs)
    # Entries not resolved by pass 1.
    objects_not_resolved1 = dbc.execute("""
        SELECT packages.id AS package_id, packages.nvra,
            package_files.id AS object_id, package_files.basename AS object_name,
            so_needed.id AS so_needed_id, so_needed.name AS so_needed_name
        FROM packages CROSS JOIN package_files CROSS JOIN so_needed
        WHERE repodir_id = ? AND package_files.package_id = packages.id AND
            so_needed.obj_file_id = package_files.id AND
            NOT EXISTS (SELECT 1 FROM so_needed_res
                WHERE so_needed_res.so_needed_id = so_needed.id)
        ORDER BY packages.nvra, package_files.basename, so_needed.name
        """, [repodir_id]).fetchall()
    print 'Object files not resolved by rpm requires-provides: ', len(objects_not_resolved1)
    if objects_not_resolved1:
        print 'Searching object files resolutions (2)...'
        # Pass 2: basename match anywhere in the searched repodirs, only
        # for the still-unresolved so_needed ids.
        in_so_needed = ','.join(str(obj_rec[4]) for obj_rec in objects_not_resolved1)
        dbc.execute("""
            INSERT INTO so_needed_res(so_needed_id, dep_obj_file_id, res_type)
            SELECT so_needed.id, tpf.id, 2 FROM packages, package_files tpf, so_needed
            WHERE packages.repodir_id IN (%s) AND packages.id = tpf.package_id AND
                so_needed.id IN (%s) AND tpf.basename = so_needed.name
            """ % (in_repodirs, in_so_needed))
        objects_not_resolved2 = dbc.execute("""
            SELECT packages.id AS package_id, packages.nvra,
                package_files.id AS object_id, package_files.basename AS object_name,
                so_needed.id AS so_needed_id, so_needed.name AS so_needed_name
            FROM packages, package_files, so_needed
            WHERE repodir_id = ? AND package_files.package_id = packages.id AND
                so_needed.obj_file_id = package_files.id AND
                NOT EXISTS (SELECT 1 FROM so_needed_res WHERE so_needed_res.so_needed_id = so_needed.id)
            ORDER BY packages.nvra, package_files.basename, so_needed.name
            """, [repodir_id]).fetchall()
        print 'Object files not resolved: ', len(objects_not_resolved2)
def process_repodir_obj_symbols(dbc, repodir_id, repodir_name, repodir_depends):
    """Resolve object-file symbols to the files that export them.

    Three passes insert into obj_symbols_res with increasing looseness:
      res_type 1 - via so_needed_res rows of type 1, also accepting the
        symlink target file (link_to_file_id) as the exporter;
      res_type 2 - via so_needed_res rows of type 2;
      res_type 3 - any still-unresolved symbol matched by name against
        any package of this repodir or its dependency repodirs.
    NOTE(review): sym_type 0 appears to mean imported (undefined) symbols
    and sym_type 1 exported ones, judging by the matching direction --
    confirm against the schema loader.
    """
    print 'Searching symbols resolutions (1)...'
    # EXPLAIN QUERY PLAN
    dbc.execute("""
        INSERT INTO obj_symbols_res(obj_sym_id, dep_obj_sym_id, res_type)
        SELECT sos.id, tos.id, 1 FROM packages CROSS JOIN package_files spf CROSS JOIN obj_symbols sos CROSS JOIN
            so_needed CROSS JOIN so_needed_res CROSS JOIN package_files tpf CROSS JOIN obj_symbols tos
        WHERE packages.repodir_id = ? AND packages.id = spf.package_id AND spf.id = sos.obj_file_id AND
            sos.sym_type = 0 AND sos.obj_file_id = so_needed.obj_file_id AND so_needed.id = so_needed_res.so_needed_id AND
            so_needed_res.res_type = 1 AND so_needed_res.dep_obj_file_id = tpf.id AND
            (tos.obj_file_id = tpf.id OR tos.obj_file_id = tpf.link_to_file_id) AND
            tos.sym_type = 1 AND tos.name = sos.name
        """, [repodir_id])
    print 'Searching symbols resolutions (2)...'
    dbc.execute("""
        INSERT INTO obj_symbols_res(obj_sym_id, dep_obj_sym_id, res_type)
        SELECT sos.id, tos.id, 2 FROM packages CROSS JOIN package_files CROSS JOIN obj_symbols sos CROSS JOIN
            so_needed CROSS JOIN so_needed_res CROSS JOIN obj_symbols tos
        WHERE packages.repodir_id = ? AND packages.id = package_files.package_id AND package_files.id = sos.obj_file_id AND
            sos.sym_type = 0 AND sos.obj_file_id = so_needed.obj_file_id AND so_needed.id = so_needed_res.so_needed_id AND
            so_needed_res.res_type = 2 AND so_needed_res.dep_obj_file_id = tos.obj_file_id AND
            tos.sym_type = 1 AND tos.name = sos.name
        """, [repodir_id])
    print 'Searching symbols resolutions (3)...'
    search_repodirs = [repodir_id]
    search_repodirs.extend(repodir_depends)
    in_repodirs = ','.join(str(id) for id in search_repodirs)
    # Pass 3: name-only match for symbols with no resolution yet.
    dbc.execute("""
        INSERT INTO obj_symbols_res(obj_sym_id, dep_obj_sym_id, res_type)
        SELECT sos.id, tos.id, 3 FROM packages CROSS JOIN package_files CROSS JOIN obj_symbols sos CROSS JOIN
            obj_symbols tos CROSS JOIN package_files tpf
        WHERE repodir_id = ? AND packages.id = package_files.package_id AND package_files.id = sos.obj_file_id AND
            sos.sym_type = 0 AND NOT EXISTS (SELECT 1 FROM obj_symbols_res WHERE obj_sym_id = sos.id) AND
            sos.name = tos.name AND tos.sym_type = 1 AND tos.obj_file_id = tpf.id AND
            tpf.package_id IN (SELECT id FROM packages WHERE repodir_id IN (%s))
        """ % in_repodirs, [repodir_id])
def process_repodir(dbc, repo_id, repo_name, repo_sources, depend_repodir_list, repodirs_processed, dep_arch):
    """Run all analysis passes for one repodir if its dependencies are ready.

    Returns False (doing nothing) when any repodir named in
    ``depend_repodir_list`` is not yet in ``repodirs_processed``.
    Otherwise resolves requires, file links, so_needed entries and object
    symbols, and - when ``repo_sources`` names a source repodir - links
    each binary package to its source rpm.  Returns True on success.

    dep_arch is forwarded to process_repodir_requires as the build_arch
    filter (non-None only for SRPMS repodirs).
    """
    all_depends_ready = True
    repodir_depends = []
    in_repodirs = ','.join(str(id) for id in repodirs_processed)
    # Translate dependency repodir names to ids, but only accept repodirs
    # that were already processed.
    for dr_name in depend_repodir_list:
        repodir_depend_found = dbc.execute("""
            SELECT id, name FROM repodirs WHERE id IN (%s) AND name = ?
            """ % in_repodirs, [dr_name]).fetchall()
        if len(repodir_depend_found) == 0:
            all_depends_ready = False
            break
        else:
            for rdf in repodir_depend_found:
                repodir_depends.append(rdf[0])
    if not all_depends_ready:
        return False
    print repo_name, ' ', depend_repodir_list, ' ', dep_arch
    process_repodir_requires(dbc, repo_id, repo_name, repodir_depends, dep_arch)
    process_repodir_file_links(dbc, repo_id, repo_name, repodir_depends)
    process_repodir_so_needed(dbc, repo_id, repo_name, repodir_depends)
    process_repodir_obj_symbols(dbc, repo_id, repo_name, repodir_depends)
    if repo_sources:
        print 'Searching source rpms...'
        dbc.execute("""
            UPDATE packages SET sourcerpm_package = NULL
            WHERE repodir_id = ?""", [repo_id])
        # Match packages.sourcerpm (minus the trailing '.rpm', 4 chars)
        # against the nvra of packages in the source repodir.
        dbc.execute("""
            UPDATE packages SET sourcerpm_package =
                (SELECT id FROM packages ps
                WHERE repodir_id IN (SELECT id FROM repodirs WHERE name = ?) AND
                    ps.nvra = substr(packages.sourcerpm, 1, length(packages.sourcerpm)-4)
                )
            WHERE repodir_id = ? AND sourcerpm LIKE '%.rpm'
            """, [repo_sources, repo_id])
    return True
def main(args):
conn = sqlite3.connect(DB)
dbc = conn.cursor()
global n
n = 0
dbc.execute("""
PRAGMA cache_size = -1048576
""")
dbc.execute("""
DELETE FROM so_needed_res""")
dbc.execute("""
DELETE FROM obj_symbols_res""")
dbc.execute("""
DELETE FROM package_depend_res""")
dbc.execute("""
ANALYZE""")
repodirs_processed = []
#Process binary rpms
repodirs_processed_cnt = -1
while repodirs_processed_cnt < len(repodirs_processed):
in_repodirs = ','.join(str(id) for id in repodirs_processed)
repodirs = dbc.execute("""
SELECT id, name, sources, path FROM repodirs WHERE sources <> '.' AND id NOT IN (%s)
""" % in_repodirs).fetchall()
for repodir in repodirs:
(repo_id, repo_name, repo_sources) = (repodir[0], repodir[1], repodir[2])
depend_repodir_names = dbc.execute(
"""
SELECT depend_repodir_name FROM repodir_depends WHERE repodir_id = ?
""", [repo_id]).fetchall()
depend_repodir_list = [drn[0] for drn in depend_repodir_names]
if process_repodir(dbc, repo_id, repo_name, repo_sources, depend_repodir_list, repodirs_processed, None):
repodirs_processed.append(repo_id)
repodirs_processed_cnt = len(repodirs_processed)
#Process SRPMS
repodirs_processed_cnt = -1
while repodirs_processed_cnt < len(repodirs_processed):
repodirs = dbc.execute("""
SELECT id, name, sources, path FROM repodirs WHERE sources = '.'
""").fetchall()
for repodir in repodirs:
(repo_id, repo_name, repo_sources) = (repodir[0], repodir[1], repodir[2])
src_build_archs = []
depend_repodir_names = dbc.execute(
"""
SELECT depend_repodir_name FROM repodir_depends WHERE repodir_id = ?
""", [repo_id]).fetchall()
for drn in depend_repodir_names:
dr_name = drn[0]
if '$arch' in dr_name:
depend_repodir_found = dbc.execute(
"""
SELECT id, name FROM repodirs WHERE name LIKE ?
""", [dr_name.replace('$arch', '%')]).fetchall()
if len(depend_repodir_found) == 0:
raise Exception('Dependancy repositories not found!')
for drf in depend_repodir_found:
arch = extract_arch(dr_name, drf[1])
if arch:
if arch == 'SRPMS':
continue
src_build_archs.append(arch)
else:
raise Exception('Source repository should depend on */$arch/* repo.')
for arch in src_build_archs:
depend_repodir_list = [drn[0].replace('$arch', arch)
for drn in depend_repodir_names]
if not process_repodir(dbc, repo_id, repo_name, None, depend_repodir_list, repodirs_processed, arch):
raise Exception('Couldn\'t process SRPMS repository!')
repodirs_processed.append(repo_id)
repodirs_processed_cnt = len(repodirs_processed)
in_repodirs = ','.join(str(id) for id in repodirs_processed)
repodirs_not_processed = dbc.execute("""
SELECT id, name, sources, path FROM repodirs rd WHERE id NOT IN (%s)
""" % in_repodirs).fetchall()
if len(repodirs_not_processed) > 0:
print 'Repodirs not processed due to dependencies:'
for rdna in repodirs_not_processed:
print rdna[1]
dbc.execute("""
ANALYZE""")
conn.commit()
if __name__ == "__main__":
main(sys.argv)