Mirror of https://abf.rosa.ru/djam/repo-analyzer.git (synced 2025-02-23 10:02:54 +00:00)

Commit 2a2544d3be: Merge fill-repodb and prepare-repodb (parent 9fb8b87fba)
5 changed files with 909 additions and 908 deletions
README | 8
@@ -1,14 +1,12 @@
 How to use the scripts:

 1. Set up the repository structure and paths in repo-analyze-config.xml
-2. Fill the database with information from the repositories:
-       fill-repodb.py repo-analyze-config.xml
+2. Fill the database with information from the repositories and prepare it for analysis:
+       prepare-repodb.py repo-analyze-config.xml
 * To speed things up, processing of .so files and of their symbols can be disabled with the -O and -S options respectively.
 ** When remote repositories are used, it is recommended to use a cache: -c cache-dir
 *** The script creates a repo.db database of about 2 GB in the current directory (with full processing, for the Chrome repositories).
-3. Prepare the database for analysis:
-       prepare-repodb.py
-4. Run the analysis/checks:
+3. Run the analysis/checks:
        analyze-repodb.py
        analyze-repo-redundancy.py i586kde.lst --repo rosa-dx-chrome-1.0/i586/main/release >i586-redundant.txt
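Both scripts take repo-analyze-config.xml as their argument; fill-repodb.py (shown below) reads it with ElementTree and iterates over a <repositories> element containing <dir> entries with name, url/path and sources attributes plus optional <dependency> children (the $arch placeholder is expanded by get_build_archs). The following is only a hypothetical illustration of that layout, with invented URLs, an invented SRPMS entry and a guessed root element name; the binary repository name is the one used in the README example above:

import xml.etree.ElementTree as ET

example_config = """<config>
  <repositories>
    <dir name="rosa-dx-chrome-1.0/i586/main/release"
         url="http://mirror.example.org/rosa-dx-chrome-1.0/i586/main/release"
         sources="rosa-dx-chrome-1.0/SRPMS/main/release"/>
    <dir name="rosa-dx-chrome-1.0/SRPMS/main/release"
         url="http://mirror.example.org/rosa-dx-chrome-1.0/SRPMS/main/release"
         sources=".">
      <!-- sources="." appears to mark a source (SRPMS) repository -->
      <dependency>rosa-dx-chrome-1.0/$arch/main/release</dependency>
    </dir>
  </repositories>
</config>"""

config_root = ET.fromstring(example_config)
for xrepodir in config_root.find('repositories').findall('dir'):
    print xrepodir.get('name'), xrepodir.get('url'), xrepodir.get('sources')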
@@ -42,7 +42,7 @@ def main(args):
     for repo in options.repo[0]:
         print repo
         rid = c.execute("""
-            SELECT id FROM repodirs WHERE name = ? OR path = ?
+            SELECT id FROM repodirs WHERE name = ? OR url = ?
             """, [repo, repo]).fetchall()
         if not rid:
             print 'Repository "%s" not found.' % repo
@@ -31,7 +31,7 @@ def detect_broken_dependencies(dbc, dot_output):
             else:
                 deps = all_broken[pkg_id]['deps']
             if deps is not None:
-                for dep_id in deps:
+                for dep_id in sorted(deps.keys()):
                     if deps[dep_id]['build_arch'] == build_arch:
                         chains = build_dep_chains(dep_id, current_repodir,
                                                   all_broken, build_arch,
@@ -256,7 +256,7 @@ def analyze_partitioning(dbc):
     print '==='
     print 'Possible partitioning:'
     repodirs = dbc.execute("""
-        SELECT id, name, sources, path FROM repodirs WHERE sources <> '.' ORDER BY id
+        SELECT id, name, sources FROM repodirs WHERE sources <> '.' ORDER BY id
         """).fetchall()
     for repodir in repodirs:
         (rd_id, rd_name) = (repodir[0], repodir[1])
@@ -348,7 +348,7 @@ class query_output:
     def __init__(self, dbc):
         self.dbc = dbc
         self.repodirs = dbc.execute("""
-            SELECT id, name, sources, path FROM repodirs ORDER BY id
+            SELECT id, name, sources FROM repodirs ORDER BY id
            """).fetchall()
         for repodir in self.repodirs:
             (rd_id, rd_name) = (repodir[0], repodir[1])
fill-repodb.py | 788 (file removed)
@@ -1,788 +0,0 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import os
import sys
import gettext
import argparse
import sqlite3
import rpm
import re
import xml.etree.ElementTree as ET
import subprocess
import shutil
import time
import multiprocessing as mp
import gc
import urllib
from urlparse import urlparse, urlunparse

gettext.install('urpm-tools')

DB = 'repo.db'

NUM_PROCESSES = 1 # number of CPU's (evaluated automatically)

RPMFILEMODE_DIRECTORY = 0x4000
RPMFILEMODE_EXECUTE = 0111

def get_files(url, ext):
    filelist = []
    urlp = urlparse(url)
    if urlp.scheme in ['ftp', 'http', 'https']:
        return parse_index_html(wget_url(url, None), url, '.rpm')
    dir_list = os.listdir(url)
    for d in dir_list:
        if d.endswith(ext):
            filepath = os.path.normpath(os.path.join(url, d))
            filelist.append(filepath)
    return filelist

def parseargs():
    parser = argparse.ArgumentParser(description=_('extract packages metadata'
                                                   ' from RPM repositories'))
    parser.add_argument('config', metavar='config',
                        help=_('path to repo-analyze-config.xml'))
    parser.add_argument('-c', '--cache-dir',
                        help=_('path to cache directory'))
    parser.add_argument('-O', '--no-shared-objects', action='store_true',
                        help=_('don\'t process shared objects'))
    parser.add_argument('-S', '--no-so-symbols', action='store_true',
                        help=_('don\'t process shared object symbols'))
    opts = parser.parse_args()
    return opts

def to_string(rpm, tag, val):
    if type(val) == type([]):
        if not(val):
            return None
    try:
        return str(val).decode('utf-8')
    except:
        print >> sys.stderr, 'Invalid UTF-8 string!\n(%s:\n%s = "%s")\n' % \
            (rpm, tag, val)
        return str(val).decode('utf-8', 'replace')

def init_database(conn):
    conn.executescript("""
        CREATE TABLE repodirs(id INTEGER PRIMARY KEY NOT NULL,
            name TEXT UNIQUE, path TEXT, arch TEXT, sources TEXT);
        CREATE TABLE repodir_depends(id INTEGER PRIMARY KEY NOT NULL,
            repodir_id INTEGER, depend_repodir_name TEXT);
        CREATE TABLE IF NOT EXISTS package_files(id INTEGER PRIMARY KEY NOT NULL,
            package_id INTEGER NOT NULL, basename TEXT, path TEXT,
            size INTEGER, mode INTEGER,
            link_to_file_id INTEGER, link_to_path TEXT, mark TEXT);
        CREATE TABLE package_requires_res(id INTEGER PRIMARY KEY NOT NULL,
            package_id INTEGER, requires_id INTEGER,
            provides_id INTEGER, dep_package_id INTEGER);
        CREATE TABLE package_conflicts_res(id INTEGER PRIMARY KEY NOT NULL,
            package_id INTEGER, conflicts_id INTEGER,
            provides_id INTEGER, dep_package_id INTEGER);
        CREATE TABLE package_obsoletes_res(id INTEGER PRIMARY KEY NOT NULL,
            package_id INTEGER, obsoletes_id INTEGER,
            provides_id INTEGER, dep_package_id INTEGER);
        CREATE TABLE so_needed(id INTEGER PRIMARY KEY NOT NULL,
            obj_file_id INTEGER, name TEXT);
        CREATE TABLE so_needed_res(id INTEGER PRIMARY KEY NOT NULL,
            so_needed_id INTEGER, dep_obj_file_id INTEGER, res_type INTEGER);
        CREATE TABLE obj_symbols(id INTEGER PRIMARY KEY NOT NULL,
            obj_file_id INTEGER, name TEXT, sym_type INTEGER);
        CREATE TABLE obj_symbols_res(id INTEGER PRIMARY KEY NOT NULL,
            obj_sym_id INTEGER, dep_obj_sym_id INTEGER, res_type INTEGER);
        PRAGMA synchronous = OFF;
        PRAGMA journal_mode = OFF;
        """)

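Once these tables (plus the packages table that process_repodir creates later from the RPM tags) are filled and indexed, the resulting repo.db can be queried directly with sqlite3. A minimal sketch; the package name is an arbitrary placeholder, not taken from the repository:

import sqlite3

conn = sqlite3.connect('repo.db')
for path, mark in conn.execute("""
        SELECT package_files.path, package_files.mark
          FROM packages JOIN package_files
            ON package_files.package_id = packages.id
         WHERE packages.name = ?""", ['chromium-browser']):
    print path, mark
conn.close()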
def index_database(conn):
    print 'Indexing the database...'
    conn.executescript("""
        CREATE INDEX rd_name ON repodirs(name);
        CREATE INDEX pkg_name ON packages(name);
        CREATE INDEX pkg_nvra ON packages(nvra);
        CREATE INDEX pkg_arch ON packages(arch);
        CREATE INDEX pkg_group ON packages(rpm_group);
        CREATE INDEX pkg_repodir ON packages(repodir_id);
        CREATE INDEX pkg_rq_pkg_req ON package_requires_res(package_id, requires_id);
        CREATE INDEX pkg_rq_pkg_prov ON package_requires_res(dep_package_id, provides_id);
        CREATE INDEX pkg_cf_pkg_conf ON package_conflicts_res(package_id, conflicts_id);
        CREATE INDEX pkg_cf_pkg_prov ON package_conflicts_res(dep_package_id, provides_id);
        CREATE INDEX pkg_ob_pkg_obs ON package_obsoletes_res(package_id, obsoletes_id);
        CREATE INDEX pkg_ob_pkg_prov ON package_obsoletes_res(dep_package_id, provides_id);
        CREATE INDEX pkg_file_pkg_id ON package_files(package_id);
        CREATE INDEX pkg_file_name ON package_files(basename);
        CREATE INDEX pkg_file_path ON package_files(path);
        CREATE INDEX pkg_file_mark ON package_files(mark);
        CREATE INDEX so_needed_obj_id ON so_needed(obj_file_id);
        CREATE INDEX so_needed_res_sn ON so_needed_res(so_needed_id);
        CREATE INDEX symbols_obj_name_type ON obj_symbols(obj_file_id, name, sym_type);
        CREATE INDEX symbols_name_type ON obj_symbols(name, sym_type);
        CREATE INDEX symbols_res_sym ON obj_symbols_res(obj_sym_id);
        """)
    dep_tables = ['rpm_requires', 'rpm_provides',
                  'rpm_conflicts', 'rpm_obsoletes']
    for table in dep_tables:
        conn.execute('CREATE INDEX %(tbl)s_pkg ON %(tbl)s(package_id)' %
                     {'tbl': table})
        conn.execute('CREATE INDEX %(tbl)s_name ON %(tbl)s(name)' %
                     {'tbl': table})
    conn.commit()

def add_repodir(xrepodir, conn):
    dbc = conn.cursor()
    dbc.execute("""
        INSERT INTO repodirs (name, path, sources) VALUES (?, ?, ?)
        """, [xrepodir.get('name'), xrepodir.get('path'), xrepodir.get('sources')])
    repodir_id = dbc.lastrowid
    for depend in xrepodir.findall('dependency'):
        dbc.execute("""
            INSERT INTO repodir_depends(repodir_id, depend_repodir_name) VALUES (?, ?)
            """, [repodir_id, depend.text.strip()])
    conn.commit()
    return repodir_id

def get_build_archs(xrepodir, xrepodirs):
    # Expand the '$arch' placeholder in this repository's <dependency>
    # entries against the names of all configured repositories to find the
    # binary architectures the (source) repository is built for.
    build_archs = []
    for depend in xrepodir.findall('dependency'):
        arch_sign = '$arch'
        depend_repo = depend.text.strip()
        spos = depend_repo.find(arch_sign)
        if spos >= 0:
            drepo_prefix = depend_repo[:spos]
            drepo_postfix = depend_repo[spos + len(arch_sign):]
            for xrepodir in xrepodirs.findall('dir'):
                repo_name = xrepodir.get('name')
                if repo_name.startswith(drepo_prefix) and \
                        repo_name.endswith(drepo_postfix):
                    repo_arch = repo_name[len(drepo_prefix) :
                                          len(repo_name) - len(drepo_postfix)]
                    if repo_arch == 'SRPMS':
                        continue
                    if repo_arch not in build_archs:
                        build_archs.append(repo_arch)
    if build_archs:
        return build_archs
    return [None]

def get_rpm_header(rpm_ts, pkg):
    hdr = None
    try:
        fdno = os.open(pkg, os.O_RDONLY)
    except OSError as exc:
        raise Exception('Unable to open file %s.\n%s' % (pkg, exc))
    try:
        hdr = rpm_ts.hdrFromFdno(fdno)
    except rpm.error as exc:
        raise Exception('Unable to read RPM header for %s\n%s.' % (pkg, exc))
    finally:
        os.close(fdno)
    return hdr

def generate_new_id(generator, gen_lock):
    # Allocate the next id from a multiprocessing.Value shared between the
    # worker processes; gen_lock serializes the read-increment-write.
    gen_lock.acquire()
    last_id = generator.value
    last_id += 1
    generator.value = last_id
    gen_lock.release()
    return last_id

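generate_new_id is the only place where the workers coordinate: ids come from a multiprocessing.Value guarded by a Lock, both created in process_repodir below. A small standalone sketch of the same pattern, reusing generate_new_id from above; the worker function and the count of four processes are made up for illustration:

import multiprocessing as mp

def demo_worker(generator, gen_lock, results):
    # each call returns an id that is unique across all processes
    results.put(generate_new_id(generator, gen_lock))

id_generator = mp.Value('i', 0)
generator_lock = mp.Lock()
results = mp.Queue()
procs = [mp.Process(target=demo_worker,
                    args=(id_generator, generator_lock, results))
         for _ in range(4)]
for p in procs:
    p.start()
for p in procs:
    p.join()
print sorted(results.get() for _ in procs)   # -> [1, 2, 3, 4]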
FILE_REC_ID_IDX = 0
FILE_REC_PATH_IDX = 3
FILE_REC_LINK_IDX = 6
FILE_REC_MARK_IDX = 7

def register_object(data, object_file_record, temp_dir, no_so_symbols):
    # Inspect one extracted file with objdump/nm: record its NEEDED shared
    # libraries in so_needed and its undefined (sym_type 0) and defined
    # (sym_type 1) dynamic symbols in obj_symbols.
    so_needed = data['so_needed']
    obj_symbols = data['obj_symbols']
    obj_id = object_file_record[0]
    obj_file_path = object_file_record[3]
    temp_obj_file = os.path.join(temp_dir, obj_file_path.lstrip('/'))

    target_file = None
    file_mark = None
    od_out = ''
    nmundef_out = ''
    nmdef_out = ''
    if os.path.islink(temp_obj_file):
        target_file = os.path.join(os.path.dirname(obj_file_path),
                                   os.readlink(temp_obj_file))
        file_mark = 'link'
    elif not os.path.exists(temp_obj_file):
        file_mark = 'not-found'
    else:
        p = subprocess.Popen(['objdump', '-p', temp_obj_file],
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        od_out = p.communicate()[0]
        if p.returncode != 0:
            file_mark = 'invalid-format'
        elif not(no_so_symbols):
            p = subprocess.Popen(['nm', '-p', '-D', '--undefined-only',
                                  temp_obj_file],
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
            nmundef_out = p.communicate()[0]
            if p.returncode != 0:
                file_mark = 'no-symbols'
            else:
                p = subprocess.Popen(['nm', '-p', '-D', '--defined-only',
                                      temp_obj_file],
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE)
                nmdef_out = p.communicate()[0]
                if p.returncode != 0:
                    file_mark = 'no-symbols'
                else:
                    file_mark = 'so'

    object_file_record[FILE_REC_LINK_IDX] = target_file
    object_file_record[FILE_REC_MARK_IDX] = file_mark

    dynsection = False
    for odline in od_out.split('\n'):
        odls = odline.strip()
        if odls == '':
            dynsection = False
        elif odls == 'Динамический раздел:' or odls == 'Dynamic section:':
            dynsection = True
        elif dynsection:
            needrem = re.match(r'\s+NEEDED\s+(.*)', odline)
            if needrem:
                so_needed.append([obj_id, needrem.group(1)])

    for symline in nmundef_out.split('\n'):
        smre = re.match(r'^.([\S]*)\s+(\w)\s(.*)$', symline)
        if smre:
            if smre.group(2) in ['v', 'w']:
                continue
            symname = smre.group(3)
            obj_symbols.append([obj_id, symname, 0])

    for symline in nmdef_out.split('\n'):
        smre = re.match(r'^.([\S]*)\s+(\w)\s(.*)$', symline)
        if smre:
            symname = smre.group(3)
            obj_symbols.append([obj_id, symname, 1])

    return obj_id

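The symbol loops above expect the default nm output format, roughly "address type name", and the undefined-symbol pass skips weak entries ('v'/'w'). A self-contained check of the same regular expression; the sample lines, addresses and symbol names are invented:

import re

sample_nm_output = ('0000000000001234 T do_something\n'
                    '                 U malloc\n'
                    '                 w __gmon_start__\n')

for symline in sample_nm_output.split('\n'):
    smre = re.match(r'^.([\S]*)\s+(\w)\s(.*)$', symline)
    if smre:
        # prints: "T do_something", "U malloc", "w __gmon_start__"
        print smre.group(2), smre.group(3)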
def extract_files(pkg, files_list, obj_so_files_idx, temp_dir):
    #local_pkg = getLocalPackageName(pkg)
    local_pkg = pkg
    filelist = os.path.join(temp_dir, 'files.lst')
    with open(filelist, 'w') as f:
        for i in obj_so_files_idx:
            f.write('.' + files_list[i][FILE_REC_PATH_IDX] + '\n')

    rpm_cpio_cmd = 'rpm2cpio ' + local_pkg + ' | cpio -ivdu -E ' + filelist
    p = subprocess.Popen(rpm_cpio_cmd,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT,
                         cwd=temp_dir,
                         shell=True)
    output = p.communicate()[0]
    if p.returncode != 0:
        print >> sys.stderr, 'Couldn\'t extract files from package %s.' \
            '\n\t%s' % (pkg, output)
        return False
    return True

def process_package_worker(num, queue_in, generator, gen_lock, db_struct,
                           repodir_id, build_archs, temp_dir,
                           no_shared_objects, no_so_symbols):
    # Worker process: take package URLs from queue_in, read their RPM headers,
    # collect package, dependency, file and shared-object data in memory and
    # write everything to the database in one transaction before exiting.
    rpm_ts = rpm.TransactionSet()
    rpm_ts.setVSFlags(~(rpm.RPMVSF_NEEDPAYLOAD))
    data = {}
    data['packages'] = []
    for table in db_struct['dep_tables']:
        data[table] = []
    data['package_files'] = []
    data['so_needed'] = []
    data['obj_symbols'] = []

    while True:
        job = queue_in.get()
        if job is None:
            break
        (pkg, ) = job

        pkg_id = generate_new_id(generator, gen_lock)
        local_pkg = get_local_file(pkg, temp_dir)

        hdr = get_rpm_header(rpm_ts, local_pkg)
        package_values = []
        package_values.append(pkg_id)

        for tag in db_struct['packages_tags']:
            hval = hdr[tag]
            package_values.append(
                (sqlite3.Binary(hval) if len(hval)>0 else None)
                    if tag in db_struct['blob_tags'] else \
                to_string(pkg, tag, hval) if \
                    type(hval) in [type([]), type('')] else hval
                )
        package_values.append(repodir_id)
        package_values.append(pkg)
        package_values.append(None)
        data['packages'].append(package_values)
        for table in db_struct['dep_tables']:
            table_data = data[table]
            rpref = 'RPMTAG_' + table[4 : -1].upper() # rpm_requires
            (dep_name, dep_flags, dep_version) = \
                (hdr[rpref + 'NAME'], hdr[rpref + 'FLAGS'], hdr[rpref + 'VERSION'])
            for i in xrange(0, len(hdr[rpref + 'NAME'])):
                for build_arch in build_archs:
                    table_data.append([dep_name[i].decode('utf-8'),
                                       dep_flags[i],
                                       dep_version[i],
                                       pkg_id, build_arch])
        (pkg_file_paths, pkg_file_names, pkg_file_sizes, pkg_file_modes) = \
            (hdr['RPMTAG_FILEPATHS'], hdr['RPMTAG_BASENAMES'],
             hdr['RPMTAG_FILESIZES'], hdr['RPMTAG_FILEMODES'])
        files_list = data['package_files']
        files_dirs = {}
        obj_so_files_idx = []
        for i in xrange(0, len(pkg_file_paths)):
            file_name = pkg_file_names[i]
            file_path = pkg_file_paths[i]
            pkg_file_id = generate_new_id(generator, gen_lock)
            files_list.append([pkg_file_id, #FILE_REC_ID_IDX = 0
                               pkg_id,
                               file_name.decode('utf-8'),
                               file_path.decode('utf-8'), #FILE_REC_PATH_IDX = 3
                               pkg_file_sizes[i],
                               pkg_file_modes[i],
                               None, #link_to_path FILE_REC_LINK_IDX = 6
                               None #mark FILE_REC_LINK_IDX = 7
                               ])
            if pkg_file_modes[i] & RPMFILEMODE_DIRECTORY != 0:
                files_dirs[file_path] = False
                continue
            dir_name = os.path.dirname(file_path)
            if dir_name != '' and dir_name not in files_dirs:
                files_dirs[dir_name] = True
            if no_shared_objects:
                continue
            if os.path.splitext(file_name)[1] in \
                    ['.debug', '.xz', '.conf', '.py', '.c', '.h', '.hpp', '.png',
                     '.cc', '.cpp', '.sh', '.java', '.pl', '.patch', '.desktop']:
                continue
            if file_path.startswith('/usr/lib/debug/.build-id') or \
                    file_path.endswith('/ld.so.cache'):
                continue
            if re.search(r'\.so($|\.)', file_name) or \
                    (pkg_file_modes[i] & RPMFILEMODE_EXECUTE) != 0:
                obj_so_files_idx.append(len(files_list) - 1)

        for fdir in sorted(files_dirs.keys()):
            if files_dirs[fdir]:
                # Add parent directories as implicit files
                # TODO: recursive processing?
                pkg_file_id = generate_new_id(generator, gen_lock)
                files_list.append([pkg_file_id, #FILE_REC_ID_IDX = 0
                                   pkg_id,
                                   os.path.basename(fdir),
                                   fdir, #FILE_REC_PATH_IDX = 3
                                   0,
                                   -1, # special mode
                                   None, #link_to_path FILE_REC_LINK_IDX = 6
                                   None #mark FILE_REC_LINK_IDX = 7
                                   ])

        if obj_so_files_idx:
            pkg_temp_dir = os.path.join(temp_dir, os.path.basename(local_pkg))
            os.makedirs(pkg_temp_dir)
            if extract_files(local_pkg, files_list,
                             obj_so_files_idx, pkg_temp_dir):
                for i in obj_so_files_idx:
                    register_object(data, files_list[i], pkg_temp_dir,
                                    no_so_symbols)

            shutil.rmtree(pkg_temp_dir, True)

        remove_cached_file(pkg)
        queue_in.task_done()

    conn = sqlite3.connect(DB, timeout=30)
    conn.executemany("""
        INSERT INTO packages (%s) VALUES (%s)""" %
        (db_struct['packages_field_names'],
         db_struct['packages_values_template']),
        data['packages'])

    for table in db_struct['dep_tables']:
        conn.executemany("""
            INSERT INTO %s (name, flags, version, package_id, build_arch)
            VALUES (?, ?, ?, ?, ?)""" % table, data[table])

    conn.executemany("""
        INSERT INTO package_files (id, package_id, basename, path, size, mode, link_to_path, mark)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)""", data['package_files'])

    conn.executemany("""
        INSERT INTO so_needed(obj_file_id, name) VALUES(?, ?)
        """, data['so_needed'])

    conn.executemany("""
        INSERT INTO obj_symbols(obj_file_id, name, sym_type) VALUES(?, ?, ?)
        """, data['obj_symbols'])

    conn.commit()
    queue_in.task_done()

local_cache = {}
def get_local_file(url, temp_dir):
    urlp = urlparse(url)
    if urlp.scheme in ['ftp', 'http', 'https']:
        cached_file_name = local_cache.get(url)
        if cached_file_name and os.path.isfile(cached_file_name):
            return cached_file_name
        cache_dir = os.path.join(temp_dir, 'cache')
        if not os.path.isdir(cache_dir):
            os.makedirs(cache_dir)
        temp_file = os.path.join(cache_dir, os.path.basename(url))
        wget_url(url, temp_file)
        local_cache[url] = temp_file
        return temp_file
    return url

def remove_cached_file(url):
    cached_file_name = local_cache.get(url)
    if cached_file_name:
        os.unlink(cached_file_name)
        del local_cache[url]

def wget_url(url, target_file):
    urlp = urlparse(url)
    wget_params = []
    site = urlp.netloc
    if urlp.username:
        wget_params = wget_params + ['--auth-no-challenge',
                                     '--http-user=%s' % urlp.username,
                                     '--http-password=%s' %
                                     ('""' if not urlp.password else urlp.password)]
        site = site[site.find('@') + 1:]
        url = urlunparse((urlp.scheme, site, urlp.path, urlp.params,
                          urlp.query, urlp.fragment))
    print 'Downloading %s...' % url
    if target_file is None:
        wget_params += ['-nv', '-O-', url]
    else:
        wget_params += ['-nv', '-O', target_file, url]
    p = subprocess.Popen(['wget'] + wget_params,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    wget_out, wget_err = p.communicate()
    if p.returncode != 0:
        print >> sys.stderr, ('Unable to get data from the url: %s '
                              '(error: %d).\n%s\n%s') % \
            (url, p.returncode, wget_out, wget_err)
        raise Exception('Unable to download data (%d).' % p.returncode)
    if target_file is None:
        return wget_out

def parse_index_html(index_html, base_url, filter_ext):
    file_list = []
    for match in re.finditer(r'href="([^"]+)"', index_html, re.M):
        filename = match.group(1)
        if filename.endswith(filter_ext):
            if '://' in filename[:8]:
                file_list.append(filename)
                continue
            filepath = os.path.join(base_url, filename)
            if os.path.dirname(filepath) == base_url.rstrip('/') and \
                    os.path.basename(filepath) == filename:
                file_list.append(filepath)
    return file_list

def download_repodir(source_urlp, cache_dir):
    # Mirror the .rpm files of a remote repository directory into cache_dir
    # and remove local files that no longer exist on the remote side.
    site = source_urlp.netloc
    site = site[site.find('@') + 1:]
    target_dir = os.path.join(cache_dir,
                              site,
                              source_urlp.path.lstrip('/'))
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    remote_files = {}
    if source_urlp.scheme in ['ftp', 'http', 'https']:
        source_url = source_urlp.geturl()
        remote_dir_contents = parse_index_html(wget_url(source_url, None),
                                               source_url, '.rpm')
        for remote_file in remote_dir_contents:
            remote_filename = urllib.unquote(os.path.basename(remote_file))
            remote_files[remote_filename] = True
            target_file = os.path.join(target_dir, remote_filename)
            if os.path.isfile(target_file):
                continue
            wget_url(remote_file, target_file)

        for local_filename in os.listdir(target_dir):
            if local_filename not in remote_files and \
                    local_filename.endswith('.rpm'):
                print 'Removing local file: %s.' % local_filename
                os.unlink(os.path.join(target_dir, local_filename))

    return target_dir

def urpm_get_packages(media):
    extra_params = []
    if not media.endswith(' update'):
        extra_params = ['--exclude-media', media + ' update']
    p = subprocess.Popen(['urpmq', '-r', '--ignorearch',
                          '--list', '--media', media] +
                         extra_params,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    urpmqr_out, urpmqr_err = p.communicate()
    if p.returncode != 0 or len(urpmqr_err) > 0:
        print >> sys.stderr, ('Unable to get a list of packages '
                              'from the media: %s.\n'
                              '%s\n%s') % (media, urpmqr_out, urpmqr_err)
        raise Exception('Unable to get a list of packages (%d).' % p.returncode)
    # urpmi --no-install --allow-nodeps --force
    # --download-all=/tmp/ xine-wavpack-1.2.4-1plf --media Desktop2012.1-8
    p = subprocess.Popen(['urpmq', '-f', '--ignorearch',
                          '--list', '--media', media] +
                         extra_params,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    urpmqf_out, urpmqf_err = p.communicate()
    if p.returncode != 0 or len(urpmqf_err) > 0:
        print >> sys.stderr, ('Unable to get a list of packages '
                              'from the media: %s.\n'
                              '%s\n%s') % (media, urpmqf_out, urpmqf_err)
        raise Exception('Unable to get a list of packages (%d).' % p.returncode)

    rpm_list = []
    qr_lines = urpmqr_out.split('\n')
    qf_lines = urpmqf_out.split('\n')
    if len(qr_lines) != len(qf_lines):
        raise Exception('Not consistent urpmq -r and urpmq -f outputs '
                        '(%d and %d lines).' % (len(qr_lines), len(qf_lines)))
    for i in xrange(0, len(qf_lines)):
        qf_line = qf_lines[i]
        if qf_line.strip() == '':
            continue
        if not qf_line.startswith(qr_lines[i]):
            raise Exception('Not consistent urpmq -r and urpmq -f outputs: '
                            '%s and %s' % (qr_lines[i], qf_line))
        rpm_list.append('urpm://%s/%s.rpm#%s' % (urllib.quote(media),
                                                 urllib.quote(qf_line),
                                                 urllib.quote(qr_lines[i])))
    return rpm_list


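Each package of a urpm medium is encoded here as a pseudo-URL, urpm://<media>/<urpmq -f name>.rpm#<urpmq -r name>, which get_urpmi below takes apart again with urlparse. A tiny illustration mirroring that round trip; the media and package names are invented:

import urllib
from urlparse import urlparse

example = 'urpm://%s/%s.rpm#%s' % (urllib.quote('Main Release'),
                                   urllib.quote('foo-1.0-1.x86_64'),
                                   urllib.quote('foo-1.0-1'))
urlp = urlparse(example)
print urlp.netloc                    # quoted media name ('Main%20Release')
print urllib.unquote(urlp.fragment)  # package name handed to urpmi ('foo-1.0-1')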
def get_urpmi(urpm_package, target_dir):
    urlp = urlparse(urpm_package)
    package_name = urllib.unquote(urlp.fragment)
    print package_name
    p = subprocess.Popen(['urpmi', '--no-install',
                          '--force', '--no-suggests',
                          '--allow-nodeps',
                          '--no-download-all',
                          '--media', urlp.netloc,
                          package_name])
                          #stdout=subprocess.PIPE,
                          #stderr=subprocess.PIPE)
    urpmi_out, urpmi_err = p.communicate()
    if p.returncode != 0:
        print >> sys.stderr, ('Unable to get the package %s '
                              'from the media %s.\n'
                              '%s\n%s') % (
                                  package_name, urlp.netloc,
                                  urpmi_out, urpmi_err)
        raise Exception('Unable to get the package %s (%d).' %
                        (package_name, p.returncode))


def urpm_get_repodir(repodir_name, cache_dir):
    target_dir = os.path.join(cache_dir,
                              repodir_name,
                              'rpms')
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    urpm_files = {}
    urpm_media_contents = urpm_get_packages(repodir_name)
    for urpm_package in urpm_media_contents:
        remote_filename = urllib.unquote(os.path.basename(urpm_package))
        target_file = os.path.join(target_dir, remote_filename)
        get_urpmi(urpm_package, os.path.join(cache_dir,
                                             repodir_name))
        print target_file
    raise Exception('Not implemented.')

generator_value = 0

def process_repodir(xrepodir, repodir_id, cache_dir, build_archs, conn,
                    db_struct, temp_dir, no_shared_objects, no_so_symbols):
    # Enumerate the .rpm files of one repository (local, remote or urpm
    # media), lazily derive the packages and dependency tables from the tags
    # of the first header seen, then fan the packages out to worker processes.
    repodir_url = xrepodir.get('url')
    urlp = urlparse(repodir_url)
    working_url = repodir_url
    if cache_dir is not None:
        if urlp.scheme in ['ftp', 'http', 'https']:
            working_url = download_repodir(urlp, cache_dir)
        elif urlp.scheme == 'urpm':
            working_url = urpm_get_repodir(xrepodir.get('name'), cache_dir)
        elif urlp.scheme not in ['', 'file']:
            raise Exception('Invalid scheme in the repository url: %s' %
                            repodir_url)
    rpm_list = []
    rpm_list = get_files(working_url, '.rpm')
    if not rpm_list:
        return
    print urlp.netloc[urlp.netloc.find('@') + 1:] + urlp.path, ': ', \
        len(rpm_list)
    if not db_struct.get('defined'):
        rpm_ts = rpm.TransactionSet()
        rpm_ts.setVSFlags(~(rpm.RPMVSF_NEEDPAYLOAD))
        # ts.setVSFlags(~(rpm.RPMVSF_NOMD5|rpm.RPMVSF_NEEDPAYLOAD))
        hdr = get_rpm_header(rpm_ts, get_local_file(rpm_list[0], temp_dir))

        # Retain sort order!
        packages_extra_fields = {'repodir_id': 'INTEGER',
                                 'rpm_url': 'TEXT',
                                 'sourcerpm_package': 'TEXT'}

        file_tags_re = r'^RPMTAG_(BASENAMES|FILE[\w\d]+)'
        dir_tags_re = r'^RPMTAG_DIR(INDEXES|NAMES)'
        changelog_tags_re = r'^RPMTAG_CHANGELOG\w+'
        trigger_tags_re = r'^RPMTAG_TRIGGER\w+'

        datetime_tags = ['RPMTAG_PACKAGETIME', 'RPMTAG_RPMLIBTIMESTAMP', ]
        db_struct['blob_tags'] = ['RPMTAG_RSAHEADER', 'RPMTAG_DSAHEADER',
                                  'RPMTAG_HEADERIMMUTABLE', 'RPMTAG_SIGMD5',
                                  'RPMTAG_PKGID', 'RPMTAG_SOURCEPKGID']

        reserved_field_names = ['id', 'group']
        skip_tags_re = '^RPMTAG_(C|D|E|N|P|R|V|HEADERIMMUTABLE)$'
        #C - CONFLICTNAME, D - DISTEPOCH, E - EPOCH, N - NAME, O - OBSOLETENAME
        #P - PROVIDENAME, R - RELEASE, V - VERSION

        types = {"<type 'str'>" : "TEXT", "<type 'int'>": "INTEGER",
                 "<type 'NoneType'>": "TEXT", "<type 'list'>": "TEXT"}

        dep_tags_re = r'^RPMTAG_(CONFLICT|OBSOLETE|PROVIDE|REQUIRE)\w+'

        db_struct['dep_tables'] = ['rpm_requires', 'rpm_provides',
                                   'rpm_conflicts', 'rpm_obsoletes']

        packages_field_names = 'id, '
        packages_values_template = '?,'
        packages_tags = []
        packages_fields = ''

        rpmtags = [str(t) for t in dir(rpm) if t.startswith('RPMTAG_')]
        for tag in rpmtags:
            if (re.match(file_tags_re, tag) or re.match(dir_tags_re, tag) or
                    re.match(changelog_tags_re, tag) or
                    re.match(skip_tags_re, tag) or
                    re.match(trigger_tags_re, tag) or
                    re.match(dep_tags_re, tag)):
                continue
            sqltype = "TIMESTAMP" if tag in datetime_tags else \
                "BLOB" if tag in db_struct['blob_tags'] else \
                types[str(type(hdr[tag]))]
            fieldname = tag.replace('RPMTAG_', '').lower()
            if fieldname in reserved_field_names:
                fieldname = 'rpm_' + fieldname
            packages_tags.append(tag)
            packages_field_names += fieldname + ', '
            packages_values_template += '?, '
            packages_fields += fieldname + ' ' + sqltype + ', '
        nef = 0
        for extra_field in sorted(packages_extra_fields.keys()):
            packages_field_names += (', ' if nef > 0 else '') + extra_field
            packages_values_template += (', ' if nef > 0 else '') + '?'
            packages_fields += (', ' if nef > 0 else '') + extra_field + ' ' + \
                packages_extra_fields[extra_field]
            nef += 1
        conn.execute("""
            CREATE TABLE IF NOT EXISTS packages(id INTEGER PRIMARY KEY NOT NULL, %s)
            """ % (packages_fields))
        for table in db_struct['dep_tables']:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS %s (id INTEGER PRIMARY KEY NOT NULL,
                    name TEXT, flags INTEGER, version TEXT, build_arch TEXT,
                    package_id INTEGER NOT NULL)""" % (table))
        conn.commit()
        db_struct['packages_tags'] = packages_tags
        db_struct['packages_field_names'] = packages_field_names
        db_struct['packages_values_template'] = packages_values_template
        db_struct['defined'] = True


    queue_in = mp.JoinableQueue()
    for pkg in rpm_list:
        queue_in.put((pkg, ))

    for i in xrange(NUM_PROCESSES):
        queue_in.put(None)

    # Trying to prevent Exception AssertionError: AssertionError() in
    # <Finalize object, dead> ignored
    gc.collect()
    time.sleep(1)
    gc.disable()
    global generator_value
    id_generator = mp.Value('i', generator_value)
    generator_lock = mp.Lock()
    # run workers
    workers = []
    for i in xrange(NUM_PROCESSES):
        worker = mp.Process(target = process_package_worker,
                            args = (i, queue_in, id_generator,
                                    generator_lock, db_struct,
                                    repodir_id, build_archs, temp_dir,
                                    no_shared_objects, no_so_symbols))
        workers.append(worker)
        worker.start()
    queue_in.join()
    gc.enable()
    generator_value = id_generator.value


def main(args):
    global NUM_PROCESSES

    if os.path.exists(DB):
        os.unlink(DB)

    if hasattr(os, "sysconf"):
        if os.sysconf_names.has_key("SC_NPROCESSORS_ONLN"):
            nproc = os.sysconf("SC_NPROCESSORS_ONLN")
            if isinstance(nproc, int) and nproc > 0:
                NUM_PROCESSES = nproc

    conn = sqlite3.connect(DB)
    init_database(conn)
    conn.commit()

    options = parseargs()
    parser = ET.XMLParser()
    tree = ET.parse(options.config, parser=parser)
    config_root = tree.getroot()
    temp_dir = '/dev/shm/rt-tmp/'
    shutil.rmtree(temp_dir, True)
    os.mkdir(temp_dir)
    rpm_db_struct = {}
    for xrepodir in config_root.find('repositories').findall('dir'):
        repodir_id = add_repodir(xrepodir, conn)
        build_archs = [None] if xrepodir.get('sources') != '.' else \
            get_build_archs(xrepodir,
                            config_root.find('repositories'))
        process_repodir(xrepodir, repodir_id, options.cache_dir,
                        build_archs, conn, rpm_db_struct, temp_dir,
                        options.no_shared_objects, options.no_so_symbols)
    shutil.rmtree(temp_dir, True)
    if rpm_db_struct.get('defined'):
        index_database(conn)
    else:
        print 'Database was not initialized ' \
            '(check whether repositories are empty).'
        os.unlink(DB)


if __name__ == "__main__":
    main(sys.argv)
File diff suppressed because it is too large