#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import sys
import gettext
import argparse
import sqlite3
import rpm
import re
import xml.etree.ElementTree as ET
import subprocess
import shutil
import time
import multiprocessing as mp
import gc
import pipes
import urllib
from urlparse import urlparse, urlunparse
gettext.install('urpm-tools')
DB = 'repo.db'
NUM_PROCESSES = 1 # number of CPUs (detected automatically in main())
RPMFILEMODE_DIRECTORY = 0x4000
RPMFILEMODE_EXECUTE = 0111
def get_files(url, ext):
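    """Return the files ending with 'ext' under 'url'; 'url' may be a local
    directory or an http(s) directory index page."""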
filelist = []
urlp = urlparse(url)
if urlp.scheme in ['http', 'https']:
        return parse_index_html(wget_url(url, None), url, ext)
dir_list = os.listdir(url)
for d in dir_list:
if d.endswith(ext):
filepath = os.path.normpath(os.path.join(url, d))
filelist.append(filepath)
return filelist
def parseargs():
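    """Parse command-line options."""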
parser = argparse.ArgumentParser(description=_('extract packages metadata'
' from RPM repositories'))
parser.add_argument('config', metavar='config',
help=_('path to repo-analyze-config.xml'))
parser.add_argument('-c', '--cache-dir',
help=_('path to cache directory'))
parser.add_argument('-O', '--no-shared-objects', action='store_true',
help=_('don\'t process shared objects'))
parser.add_argument('-S', '--no-so-symbols', action='store_true',
help=_('don\'t process shared object symbols'))
opts = parser.parse_args()
return opts
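# Typical invocation (paths are illustrative, not taken from the sources):
#   ./fill-repodb.py --cache-dir /srv/repo-cache repo-analyze-config.xml
# The collected metadata is written to ./repo.db (the DB constant above).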
def to_string(pkg, tag, val):
    """Decode an RPM tag value to unicode, replacing bad UTF-8 sequences."""
    if type(val) == type([]) and not val:
        return None
    try:
        return str(val).decode('utf-8')
    except UnicodeDecodeError:
        print >> sys.stderr, 'Invalid UTF-8 string!\n(%s:\n%s = "%s")\n' % \
            (pkg, tag, val)
        return str(val).decode('utf-8', 'replace')
def init_database(conn):
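    """Create the static part of the schema. The packages table and the
    rpm_* dependency tables are created later, in process_repodir(), once
    the set of available RPM tags is known."""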
conn.executescript("""
CREATE TABLE repodirs(id INTEGER PRIMARY KEY NOT NULL,
name TEXT UNIQUE, path TEXT, arch TEXT, sources TEXT);
CREATE TABLE repodir_depends(id INTEGER PRIMARY KEY NOT NULL,
repodir_id INTEGER, depend_repodir_name TEXT);
CREATE TABLE IF NOT EXISTS package_files(id INTEGER PRIMARY KEY NOT NULL,
package_id INTEGER NOT NULL, basename TEXT, path TEXT,
size INTEGER, mode INTEGER,
link_to_file_id INTEGER, link_to_path TEXT, mark TEXT);
CREATE TABLE package_requires_res(id INTEGER PRIMARY KEY NOT NULL,
package_id INTEGER, requires_id INTEGER,
provides_id INTEGER, dep_package_id INTEGER);
CREATE TABLE package_conflicts_res(id INTEGER PRIMARY KEY NOT NULL,
package_id INTEGER, conflicts_id INTEGER,
provides_id INTEGER, dep_package_id INTEGER);
CREATE TABLE package_obsoletes_res(id INTEGER PRIMARY KEY NOT NULL,
package_id INTEGER, obsoletes_id INTEGER,
provides_id INTEGER, dep_package_id INTEGER);
CREATE TABLE so_needed(id INTEGER PRIMARY KEY NOT NULL,
obj_file_id INTEGER, name TEXT);
CREATE TABLE so_needed_res(id INTEGER PRIMARY KEY NOT NULL,
so_needed_id INTEGER, dep_obj_file_id INTEGER, res_type INTEGER);
CREATE TABLE obj_symbols(id INTEGER PRIMARY KEY NOT NULL,
obj_file_id INTEGER, name TEXT, sym_type INTEGER);
CREATE TABLE obj_symbols_res(id INTEGER PRIMARY KEY NOT NULL,
obj_sym_id INTEGER, dep_obj_sym_id INTEGER, res_type INTEGER);
PRAGMA synchronous = OFF;
PRAGMA journal_mode = OFF;
""")
def index_database(conn):
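    """Create all indexes once the bulk load has finished."""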
print 'Indexing the database...'
conn.executescript("""
CREATE INDEX rd_name ON repodirs(name);
CREATE INDEX pkg_name ON packages(name);
CREATE INDEX pkg_nvra ON packages(nvra);
CREATE INDEX pkg_arch ON packages(arch);
CREATE INDEX pkg_group ON packages(rpm_group);
CREATE INDEX pkg_repodir ON packages(repodir_id);
CREATE INDEX pkg_rq_pkg_req ON package_requires_res(package_id, requires_id);
CREATE INDEX pkg_rq_pkg_prov ON package_requires_res(dep_package_id, provides_id);
CREATE INDEX pkg_cf_pkg_conf ON package_conflicts_res(package_id, conflicts_id);
CREATE INDEX pkg_cf_pkg_prov ON package_conflicts_res(dep_package_id, provides_id);
CREATE INDEX pkg_ob_pkg_obs ON package_obsoletes_res(package_id, obsoletes_id);
CREATE INDEX pkg_ob_pkg_prov ON package_obsoletes_res(dep_package_id, provides_id);
CREATE INDEX pkg_file_pkg_id ON package_files(package_id);
CREATE INDEX pkg_file_name ON package_files(basename);
CREATE INDEX pkg_file_path ON package_files(path);
CREATE INDEX pkg_file_mark ON package_files(mark);
CREATE INDEX so_needed_obj_id ON so_needed(obj_file_id);
CREATE INDEX so_needed_res_sn ON so_needed_res(so_needed_id);
CREATE INDEX symbols_obj_name_type ON obj_symbols(obj_file_id, name, sym_type);
CREATE INDEX symbols_name_type ON obj_symbols(name, sym_type);
CREATE INDEX symbols_res_sym ON obj_symbols_res(obj_sym_id);
""")
dep_tables = ['rpm_requires', 'rpm_provides',
'rpm_conflicts', 'rpm_obsoletes']
for table in dep_tables:
conn.execute('CREATE INDEX %(tbl)s_pkg ON %(tbl)s(package_id)' %
{'tbl': table})
conn.execute('CREATE INDEX %(tbl)s_name ON %(tbl)s(name)' %
{'tbl': table})
conn.commit()
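# Shape of repo-analyze-config.xml as read by the code below (element and
# attribute names come from the find()/get() calls; the root tag name and
# all values are illustrative):
#
#   <config>
#     <repositories>
#       <dir name="main.i586" url="http://host/main/i586"/>
#       <dir name="main.SRPMS" url="http://host/main/SRPMS" sources=".">
#         <dependency>main.$arch</dependency>
#       </dir>
#     </repositories>
#   </config>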
def add_repodir(xrepodir, conn):
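    """Insert a repository record and its <dependency> children into the
    database and return the new repodir id."""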
dbc = conn.cursor()
dbc.execute("""
INSERT INTO repodirs (name, path, sources) VALUES (?, ?, ?)
""", [xrepodir.get('name'), xrepodir.get('path'), xrepodir.get('sources')])
repodir_id = dbc.lastrowid
for depend in xrepodir.findall('dependency'):
dbc.execute("""
INSERT INTO repodir_depends(repodir_id, depend_repodir_name) VALUES (?, ?)
""", [repodir_id, depend.text.strip()])
conn.commit()
return repodir_id
def get_build_archs(xrepodir, xrepodirs):
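    """Expand '$arch' in the dependency names of a source repository against
    the names of all configured repositories and collect the architectures
    found (SRPMS excluded). Returns [None] when nothing matches."""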
build_archs = []
for depend in xrepodir.findall('dependency'):
arch_sign = '$arch'
depend_repo = depend.text.strip()
spos = depend_repo.find(arch_sign)
if spos >= 0:
drepo_prefix = depend_repo[:spos]
drepo_postfix = depend_repo[spos + len(arch_sign):]
            for xdir in xrepodirs.findall('dir'):
                repo_name = xdir.get('name')
                if repo_name.startswith(drepo_prefix) and \
                        repo_name.endswith(drepo_postfix):
                    repo_arch = repo_name[len(drepo_prefix):
                                          len(repo_name) - len(drepo_postfix)]
if repo_arch == 'SRPMS':
continue
if repo_arch not in build_archs:
build_archs.append(repo_arch)
if build_archs:
return build_archs
return [None]
def get_rpm_header(rpm_ts, pkg):
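    """Read the header of an RPM package file."""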
hdr = None
try:
fdno = os.open(pkg, os.O_RDONLY)
except OSError as exc:
raise Exception('Unable to open file %s.\n%s' % (pkg, exc))
try:
hdr = rpm_ts.hdrFromFdno(fdno)
except rpm.error as exc:
raise Exception('Unable to read RPM header for %s\n%s.' % (pkg, exc))
finally:
os.close(fdno)
return hdr
def generate_new_id(generator, gen_lock):
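    """Atomically take the next value from the shared id counter."""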
gen_lock.acquire()
last_id = generator.value
last_id += 1
generator.value = last_id
gen_lock.release()
return last_id
FILE_REC_ID_IDX = 0
FILE_REC_PATH_IDX = 3
FILE_REC_LINK_IDX = 6
FILE_REC_MARK_IDX = 7
def register_object(data, object_file_record, temp_dir, no_so_symbols):
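    """Inspect one extracted object file: record symlink targets, NEEDED
    entries from 'objdump -p' and dynamic symbols from 'nm -D', and mark
    the file record with the outcome (link/not-found/invalid-format/
    no-symbols/so)."""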
so_needed = data['so_needed']
obj_symbols = data['obj_symbols']
    obj_id = object_file_record[FILE_REC_ID_IDX]
    obj_file_path = object_file_record[FILE_REC_PATH_IDX]
temp_obj_file = os.path.join(temp_dir, obj_file_path.lstrip('/'))
target_file = None
file_mark = None
od_out = ''
nmundef_out = ''
nmdef_out = ''
if os.path.islink(temp_obj_file):
target_file = os.path.join(os.path.dirname(obj_file_path),
os.readlink(temp_obj_file))
file_mark = 'link'
elif not os.path.exists(temp_obj_file):
file_mark = 'not-found'
else:
p = subprocess.Popen(['objdump', '-p', temp_obj_file],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
od_out = p.communicate()[0]
if p.returncode != 0:
file_mark = 'invalid-format'
elif not(no_so_symbols):
p = subprocess.Popen(['nm', '-p', '-D', '--undefined-only',
temp_obj_file],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
nmundef_out = p.communicate()[0]
if p.returncode != 0:
file_mark = 'no-symbols'
else:
p = subprocess.Popen(['nm', '-p', '-D', '--defined-only',
temp_obj_file],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
nmdef_out = p.communicate()[0]
if p.returncode != 0:
file_mark = 'no-symbols'
else:
file_mark = 'so'
object_file_record[FILE_REC_LINK_IDX] = target_file
object_file_record[FILE_REC_MARK_IDX] = file_mark
dynsection = False
for odline in od_out.split('\n'):
odls = odline.strip()
if odls == '':
dynsection = False
        # objdump output may be localized (Russian or English headers)
        elif odls in ('Динамический раздел:', 'Dynamic section:'):
dynsection = True
elif dynsection:
needrem = re.match(r'\s+NEEDED\s+(.*)', odline)
if needrem:
so_needed.append([obj_id, needrem.group(1)])
for symline in nmundef_out.split('\n'):
smre = re.match(r'^.([\S]*)\s+(\w)\s(.*)$', symline)
if smre:
if smre.group(2) in ['v', 'w']:
continue
symname = smre.group(3)
obj_symbols.append([obj_id, symname, 0])
for symline in nmdef_out.split('\n'):
smre = re.match(r'^.([\S]*)\s+(\w)\s(.*)$', symline)
if smre:
symname = smre.group(3)
obj_symbols.append([obj_id, symname, 1])
return obj_id
def extract_files(pkg, files_list, obj_so_files_idx, temp_dir):
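    """Unpack the listed files from the package into temp_dir with
    'rpm2cpio | cpio'. Returns False when the extraction fails."""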
local_pkg = pkg
filelist = os.path.join(temp_dir, 'files.lst')
with open(filelist, 'w') as f:
for i in obj_so_files_idx:
f.write('.' + files_list[i][FILE_REC_PATH_IDX] + '\n')
    # Quote paths so the shell pipeline survives spaces in file names
    rpm_cpio_cmd = 'rpm2cpio %s | cpio -ivdu -E %s' % \
        (pipes.quote(local_pkg), pipes.quote(filelist))
p = subprocess.Popen(rpm_cpio_cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
cwd=temp_dir,
shell=True)
output = p.communicate()[0]
if p.returncode != 0:
print >> sys.stderr, 'Couldn\'t extract files from package %s.' \
'\n\t%s' % (pkg, output)
return False
return True
def process_package_worker(num, queue_in, generator, gen_lock, db_struct,
repodir_id, build_archs, temp_dir,
no_shared_objects, no_so_symbols):
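    """Worker process body: take package paths from queue_in, collect
    package, dependency, file and shared-object data, and flush everything
    to the database in a single transaction at the end."""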
rpm_ts = rpm.TransactionSet()
rpm_ts.setVSFlags(~(rpm.RPMVSF_NEEDPAYLOAD))
data = {}
data['packages'] = []
for table in db_struct['dep_tables']:
data[table] = []
data['package_files'] = []
data['so_needed'] = []
data['obj_symbols'] = []
while True:
job = queue_in.get()
        if job is None:
break
(pkg, ) = job
pkg_id = generate_new_id(generator, gen_lock)
local_pkg = get_local_file(pkg, temp_dir)
hdr = get_rpm_header(rpm_ts, local_pkg)
package_values = []
package_values.append(pkg_id)
        for tag in db_struct['packages_tags']:
            hval = hdr[tag]
            if tag in db_struct['blob_tags']:
                value = sqlite3.Binary(hval) if len(hval) > 0 else None
            elif type(hval) in [type([]), type('')]:
                value = to_string(pkg, tag, hval)
            else:
                value = hval
            package_values.append(value)
package_values.append(repodir_id)
package_values.append(pkg)
package_values.append(None)
data['packages'].append(package_values)
for table in db_struct['dep_tables']:
table_data = data[table]
            rpref = 'RPMTAG_' + table[4 : -1].upper() # 'rpm_requires' -> 'RPMTAG_REQUIRE'
(dep_name, dep_flags, dep_version) = \
(hdr[rpref + 'NAME'], hdr[rpref + 'FLAGS'], hdr[rpref + 'VERSION'])
            for i in xrange(0, len(dep_name)):
for build_arch in build_archs:
table_data.append([dep_name[i].decode('utf-8'),
dep_flags[i],
dep_version[i],
pkg_id, build_arch])
(pkg_file_paths, pkg_file_names, pkg_file_sizes, pkg_file_modes) = \
(hdr['RPMTAG_FILEPATHS'], hdr['RPMTAG_BASENAMES'],
hdr['RPMTAG_FILESIZES'], hdr['RPMTAG_FILEMODES'])
files_list = data['package_files']
files_dirs = {}
obj_so_files_idx = []
for i in xrange(0, len(pkg_file_paths)):
file_name = pkg_file_names[i]
file_path = pkg_file_paths[i]
pkg_file_id = generate_new_id(generator, gen_lock)
files_list.append([pkg_file_id, #FILE_REC_ID_IDX = 0
pkg_id,
file_name.decode('utf-8'),
file_path.decode('utf-8'), #FILE_REC_PATH_IDX = 3
pkg_file_sizes[i],
pkg_file_modes[i],
None, #link_to_path FILE_REC_LINK_IDX = 6
                               None #mark FILE_REC_MARK_IDX = 7
])
if pkg_file_modes[i] & RPMFILEMODE_DIRECTORY != 0:
files_dirs[file_path] = False
continue
dir_name = os.path.dirname(file_path)
if dir_name != '' and dir_name not in files_dirs:
files_dirs[dir_name] = True
if no_shared_objects:
continue
if os.path.splitext(file_name)[1] in \
['.debug', '.xz', '.conf', '.py', '.c', '.h', '.hpp', '.png',
'.cc', '.cpp', '.sh', '.java', '.pl', '.patch', '.desktop']:
continue
if file_path.startswith('/usr/lib/debug/.build-id') or \
file_path.endswith('/ld.so.cache'):
continue
if re.search(r'\.so($|\.)', file_name) or \
(pkg_file_modes[i] & RPMFILEMODE_EXECUTE) != 0:
obj_so_files_idx.append(len(files_list) - 1)
for fdir in sorted(files_dirs.keys()):
if files_dirs[fdir]:
# Add parent directories as implicit files
# TODO: recursive processing?
pkg_file_id = generate_new_id(generator, gen_lock)
files_list.append([pkg_file_id, #FILE_REC_ID_IDX = 0
pkg_id,
os.path.basename(fdir),
fdir, #FILE_REC_PATH_IDX = 3
0,
-1, # special mode
None, #link_to_path FILE_REC_LINK_IDX = 6
                                   None #mark FILE_REC_MARK_IDX = 7
])
if obj_so_files_idx:
pkg_temp_dir = os.path.join(temp_dir, os.path.basename(local_pkg))
os.makedirs(pkg_temp_dir)
if extract_files(local_pkg, files_list,
obj_so_files_idx, pkg_temp_dir):
for i in obj_so_files_idx:
register_object(data, files_list[i], pkg_temp_dir,
no_so_symbols)
shutil.rmtree(pkg_temp_dir, True)
remove_cached_file(pkg)
queue_in.task_done()
conn = sqlite3.connect(DB, timeout=30)
conn.executemany("""
INSERT INTO packages (%s) VALUES (%s)""" %
(db_struct['packages_field_names'],
db_struct['packages_values_template']),
data['packages'])
for table in db_struct['dep_tables']:
conn.executemany("""
INSERT INTO %s (name, flags, version, package_id, build_arch)
VALUES (?, ?, ?, ?, ?)""" % table, data[table])
conn.executemany("""
INSERT INTO package_files (id, package_id, basename, path, size, mode, link_to_path, mark)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)""", data['package_files'])
conn.executemany("""
INSERT INTO so_needed(obj_file_id, name) VALUES(?, ?)
""", data['so_needed'])
conn.executemany("""
INSERT INTO obj_symbols(obj_file_id, name, sym_type) VALUES(?, ?, ?)
""", data['obj_symbols'])
conn.commit()
queue_in.task_done()
local_cache = {} # url -> path of the locally downloaded copy (per-process)
def get_local_file(url, temp_dir):
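    """Return a local path for 'url', downloading http(s) files into a
    cache directory under temp_dir first."""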
urlp = urlparse(url)
if urlp.scheme in ['http', 'https']:
cached_file_name = local_cache.get(url)
if cached_file_name and os.path.isfile(cached_file_name):
return cached_file_name
cache_dir = os.path.join(temp_dir, 'cache')
if not os.path.isdir(cache_dir):
os.makedirs(cache_dir)
temp_file = os.path.join(cache_dir, os.path.basename(url))
wget_url(url, temp_file)
local_cache[url] = temp_file
return temp_file
return url
def remove_cached_file(url):
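    """Delete the locally cached copy of 'url', if there is one."""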
cached_file_name = local_cache.get(url)
if cached_file_name:
os.unlink(cached_file_name)
del local_cache[url]
def wget_url(url, target_file):
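    """Fetch 'url' with wget. Returns the downloaded content when
    target_file is None, otherwise stores it in target_file. Credentials
    embedded in the url are passed via --http-user/--http-password."""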
urlp = urlparse(url)
wget_params = []
site = urlp.netloc
if urlp.username:
wget_params = wget_params + ['--auth-no-challenge',
'--http-user=%s' % urlp.username,
                                     '--http-password=%s' %
                                     (urlp.password if urlp.password else '')]
site = site[site.find('@') + 1:]
url = urlunparse((urlp.scheme, site, urlp.path, urlp.params,
urlp.query, urlp.fragment))
print 'Downloading %s...' % url
if target_file is None:
wget_params += ['-nv', '-O-', url]
else:
wget_params += ['-nv', '-O', target_file, url]
p = subprocess.Popen(['wget'] + wget_params,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
wget_out, wget_err = p.communicate()
if p.returncode != 0:
print >> sys.stderr, ('Unable to get data from the url: %s '
'(error: %d).\n%s\n%s') % \
(url, p.returncode, wget_out, wget_err)
raise Exception('Unable to download data (%d).' % p.returncode)
if target_file is None:
return wget_out
def parse_index_html(index_html, base_url, filter_ext):
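    """Collect the direct child links ending with filter_ext from an HTML
    directory index page."""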
file_list = []
for match in re.finditer(r'href="([^"]+)"', index_html, re.M):
filename = match.group(1)
if filename.endswith(filter_ext):
filepath = os.path.join(base_url, filename)
if os.path.dirname(filepath) == base_url.rstrip('/') and \
os.path.basename(filepath) == filename:
file_list.append(filepath)
return file_list
def download_repodir(source_urlp, cache_dir):
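    """Mirror a remote http(s) repository directory under cache_dir and
    return the local directory; local .rpm files no longer present on the
    remote side are removed."""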
site = source_urlp.netloc
site = site[site.find('@') + 1:]
target_dir = os.path.join(cache_dir,
site,
source_urlp.path.lstrip('/'))
if not os.path.isdir(target_dir):
os.makedirs(target_dir)
remote_files = {}
if source_urlp.scheme in ['http', 'https']:
source_url = source_urlp.geturl()
remote_dir_contents = parse_index_html(wget_url(source_url, None),
source_url, '.rpm')
for remote_file in remote_dir_contents:
remote_filename = urllib.unquote(os.path.basename(remote_file))
remote_files[remote_filename] = True
target_file = os.path.join(target_dir, remote_filename)
if os.path.isfile(target_file):
continue
wget_url(remote_file, target_file)
for local_filename in os.listdir(target_dir):
if local_filename not in remote_files and \
local_filename.endswith('.rpm'):
print 'Removing local file: %s.' % local_filename
os.unlink(os.path.join(target_dir, local_filename))
return target_dir
def urpm_get_packages(media):
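    """Query the package list of an urpmi media with urpmq (unfinished)."""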
p = subprocess.Popen(['urpmq', '-r', '--ignorearch',
'--list', '--media', media],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
urpmq_out, urpmq_err = p.communicate()
if p.returncode != 0 or len(urpmq_err) > 0:
print >> sys.stderr, ('Unable to get a list of packages '
'from the media: %s.\n'
'%s\n%s') % (media, urpmq_out, urpmq_err)
raise Exception('Unable to get a list of packages (%d).' % p.returncode)
# urpmi --no-install --allow-nodeps --force
# --download-all=/tmp/ xine-wavpack-1.2.4-1plf --media Desktop2012.1-8
p = subprocess.Popen(['urpmq', '-f', '--ignorearch',
'--list', '--media', media],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
urpmq_out, urpmq_err = p.communicate()
if p.returncode != 0 or len(urpmq_err) > 0:
print >> sys.stderr, ('Unable to get a list of packages '
'from the media: %s.\n'
'%s\n%s') % (media, urpmq_out, urpmq_err)
raise Exception('Unable to get a list of packages (%d).' % p.returncode)
#qr_lines = urpmq_out.split('\n')
raise Exception('Not implemented.')
def urpm_get_repodir(repodir_name, cache_dir):
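    """Mirror an urpmi media into cache_dir (unfinished)."""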
target_dir = os.path.join(cache_dir,
repodir_name,
'rpms')
if not os.path.isdir(target_dir):
os.makedirs(target_dir)
urpm_files = {}
urpm_media_contents = urpm_get_packages(repodir_name)
raise Exception('Not implemented.')
# Global id counter, carried over between calls to process_repodir()
generator_value = 0
def process_repodir(xrepodir, repodir_id, cache_dir, build_archs, conn,
db_struct, temp_dir, no_shared_objects, no_so_symbols):
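    """Process one repository: on the first call, derive the packages table
    schema from the RPM tags of a sample package; then fan the package list
    out to NUM_PROCESSES worker processes."""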
repodir_url = xrepodir.get('url')
urlp = urlparse(repodir_url)
working_url = repodir_url
if cache_dir is not None:
if urlp.scheme in ['http', 'https']:
working_url = download_repodir(urlp, cache_dir)
elif urlp.scheme == 'urpm':
working_url = urpm_get_repodir(xrepodir.get('name'), cache_dir)
elif urlp.scheme not in ['', 'file']:
raise Exception('Invalid scheme in the repository url: %s' %
repodir_url)
    rpm_list = get_files(working_url, '.rpm')
if not rpm_list:
return
print urlp.netloc[urlp.netloc.find('@') + 1:] + urlp.path, ': ', \
len(rpm_list)
if not db_struct.get('defined'):
rpm_ts = rpm.TransactionSet()
rpm_ts.setVSFlags(~(rpm.RPMVSF_NEEDPAYLOAD))
# ts.setVSFlags(~(rpm.RPMVSF_NOMD5|rpm.RPMVSF_NEEDPAYLOAD))
hdr = get_rpm_header(rpm_ts, get_local_file(rpm_list[0], temp_dir))
        # NB: these extra fields are appended to the packages table in
        # sorted key order; package_values in process_package_worker()
        # appends the corresponding values in the same order.
        packages_extra_fields = {'repodir_id': 'INTEGER',
                                 'rpm_url': 'TEXT',
                                 'sourcerpm_package': 'TEXT'}
file_tags_re = r'^RPMTAG_(BASENAMES|FILE[\w\d]+)'
dir_tags_re = r'^RPMTAG_DIR(INDEXES|NAMES)'
changelog_tags_re = r'^RPMTAG_CHANGELOG\w+'
trigger_tags_re = r'^RPMTAG_TRIGGER\w+'
datetime_tags = ['RPMTAG_PACKAGETIME', 'RPMTAG_RPMLIBTIMESTAMP', ]
db_struct['blob_tags'] = ['RPMTAG_RSAHEADER', 'RPMTAG_DSAHEADER',
'RPMTAG_HEADERIMMUTABLE', 'RPMTAG_SIGMD5',
'RPMTAG_PKGID', 'RPMTAG_SOURCEPKGID']
reserved_field_names = ['id', 'group']
        skip_tags_re = r'^RPMTAG_(C|D|E|N|O|P|R|V|HEADERIMMUTABLE)$'
        # Single-letter tag aliases: C - CONFLICTNAME, D - DISTEPOCH,
        # E - EPOCH, N - NAME, O - OBSOLETENAME, P - PROVIDENAME,
        # R - RELEASE, V - VERSION
types = {"<type 'str'>" : "TEXT", "<type 'int'>": "INTEGER",
"<type 'NoneType'>": "TEXT", "<type 'list'>": "TEXT"}
dep_tags_re = r'^RPMTAG_(CONFLICT|OBSOLETE|PROVIDE|REQUIRE)\w+'
db_struct['dep_tables'] = ['rpm_requires', 'rpm_provides',
'rpm_conflicts', 'rpm_obsoletes']
packages_field_names = 'id, '
packages_values_template = '?,'
packages_tags = []
packages_fields = ''
rpmtags = [str(t) for t in dir(rpm) if t.startswith('RPMTAG_') ]
for tag in rpmtags:
if (re.match(file_tags_re, tag) or re.match(dir_tags_re, tag) or
re.match(changelog_tags_re, tag) or
re.match(skip_tags_re, tag) or
re.match(trigger_tags_re, tag) or
re.match(dep_tags_re, tag)):
continue
sqltype = "TIMESTAMP" if tag in datetime_tags else \
"BLOB" if tag in db_struct['blob_tags'] else \
types[str(type(hdr[tag]))]
fieldname = tag.replace('RPMTAG_', '').lower()
if fieldname in reserved_field_names:
fieldname = 'rpm_' + fieldname
packages_tags.append(tag)
packages_field_names += fieldname + ', '
packages_values_template += '?, '
packages_fields += fieldname + ' ' + sqltype + ', '
nef = 0
for extra_field in sorted(packages_extra_fields.keys()):
packages_field_names += (', ' if nef > 0 else '') + extra_field
packages_values_template += (', ' if nef > 0 else '') + '?'
packages_fields += (', ' if nef > 0 else '') + extra_field + ' ' + \
packages_extra_fields[extra_field]
nef += 1
conn.execute("""
CREATE TABLE IF NOT EXISTS packages(id INTEGER PRIMARY KEY NOT NULL, %s)
""" % (packages_fields))
for table in db_struct['dep_tables']:
conn.execute("""
CREATE TABLE IF NOT EXISTS %s (id INTEGER PRIMARY KEY NOT NULL,
name TEXT, flags INTEGER, version TEXT, build_arch TEXT,
package_id INTEGER NOT NULL)""" % (table))
conn.commit()
db_struct['packages_tags'] = packages_tags
db_struct['packages_field_names'] = packages_field_names
db_struct['packages_values_template'] = packages_values_template
db_struct['defined'] = True
queue_in = mp.JoinableQueue()
for pkg in rpm_list:
queue_in.put((pkg, ))
for i in xrange(NUM_PROCESSES):
queue_in.put(None)
# Trying to prevent Exception AssertionError: AssertionError() in
# <Finalize object, dead> ignored
gc.collect()
time.sleep(1)
gc.disable()
global generator_value
id_generator = mp.Value('i', generator_value)
generator_lock = mp.Lock()
# run workers
workers = []
for i in xrange(NUM_PROCESSES):
        worker = mp.Process(target=process_package_worker,
                            args=(i, queue_in, id_generator,
                                  generator_lock, db_struct,
                                  repodir_id, build_archs, temp_dir,
                                  no_shared_objects, no_so_symbols))
workers.append(worker)
worker.start()
queue_in.join()
gc.enable()
generator_value = id_generator.value
def main(args):
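    """Recreate the database and process every repository directory listed
    in the configuration file."""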
global NUM_PROCESSES
if os.path.exists(DB):
os.unlink(DB)
if hasattr(os, "sysconf"):
        if "SC_NPROCESSORS_ONLN" in os.sysconf_names:
nproc = os.sysconf("SC_NPROCESSORS_ONLN")
if isinstance(nproc, int) and nproc > 0:
NUM_PROCESSES = nproc
conn = sqlite3.connect(DB)
init_database(conn)
conn.commit()
options = parseargs()
parser = ET.XMLParser()
tree = ET.parse(options.config, parser=parser)
config_root = tree.getroot()
    temp_dir = '/dev/shm/rt-tmp/' # tmpfs-backed scratch directory
shutil.rmtree(temp_dir, True)
os.mkdir(temp_dir)
rpm_db_struct = {}
for xrepodir in config_root.find('repositories').findall('dir'):
repodir_id = add_repodir(xrepodir, conn)
build_archs = [None] if xrepodir.get('sources') != '.' else \
get_build_archs(xrepodir,
config_root.find('repositories'))
process_repodir(xrepodir, repodir_id, options.cache_dir,
build_archs, conn, rpm_db_struct, temp_dir,
options.no_shared_objects, options.no_so_symbols)
shutil.rmtree(temp_dir, True)
if rpm_db_struct.get('defined'):
index_database(conn)
else:
print 'Database was not initialized ' \
'(check whether repositories are empty).'
os.unlink(DB)
if __name__ == "__main__":
main(sys.argv)