From 865dff2a86592307b6717c6c379f3a711a317f73 Mon Sep 17 00:00:00 2001
From: Alexander Lakhin
Date: Wed, 12 Feb 2014 16:08:24 +0400
Subject: [PATCH] Implement support for remote repositories

---
 README                  |   3 +-
 fill-repodb.py          | 242 +++++++++++++++++++++++++++++++++-------
 repo-analyze-config.xml |  32 +++---
 3 files changed, 217 insertions(+), 60 deletions(-)

diff --git a/README b/README
index 5b31b5a..41d56f8 100644
--- a/README
+++ b/README
@@ -4,7 +4,8 @@
 2. Populate the database with information from the repositories:
    fill-repodb.py repo-analyze-config.xml
 * To speed this up, processing of .so files and their symbols can be disabled with the -O and -S flags respectively.
-** The script creates a repo.db database of about 2 GB in the current directory (with full processing).
+** When working with remote repositories, using a cache is recommended: -c cache-dir
+*** The script creates a repo.db database of about 2 GB in the current directory (with full processing, for the Chrome repositories).
 3. Prepare the database for analysis:
    prepare-repodb.py
 4. Run the analysis/checks:
diff --git a/fill-repodb.py b/fill-repodb.py
index e98aa28..a7edf81 100755
--- a/fill-repodb.py
+++ b/fill-repodb.py
@@ -14,6 +14,8 @@ import shutil
 import time
 import multiprocessing as mp
 import gc
+import urllib
+from urlparse import urlparse, urlunparse
 
 gettext.install('urpm-tools')
 
@@ -24,17 +26,16 @@ NUM_PROCESSES = 4 # number of CPU's (evaluated automatically)
 RPMFILEMODE_DIRECTORY = 0x4000
 RPMFILEMODE_EXECUTE = 0111
 
-def getFileList(path, ext, filelist):
-    extlen = len(ext)
-    dir_list = os.listdir(path)
-
+def get_files(url, ext):
+    filelist = []
+    urlp = urlparse(url)
+    if urlp.scheme in ['http', 'https']:
+        return parse_index_html(wget_url(url, None), url, ext)
+    dir_list = os.listdir(url)
     for d in dir_list:
-        if os.path.isdir(path + '/' + d):
-            filelist = getFileList(path + '/' + d, ext, filelist)
-        else:
-            if d[-extlen:].lower() == ext:
-                newpath = os.path.normpath(path + '/' + d)
-                filelist.append(newpath)
+        if d.endswith(ext):
+            filepath = os.path.normpath(os.path.join(url, d))
+            filelist.append(filepath)
     return filelist
 
 def parseargs():
@@ -42,6 +43,8 @@ def parseargs():
                                          ' from RPM repositories'))
     parser.add_argument('config', metavar='config',
                         help=_('path to repo-analyze-config.xml'))
+    parser.add_argument('-c', '--cache-dir',
+                        help=_('path to cache directory'))
     parser.add_argument('-O', '--no-shared-objects', action='store_true',
                         help=_('don\'t process shared objects'))
     parser.add_argument('-S', '--no-so-symbols', action='store_true',
@@ -301,17 +304,19 @@ def process_package_worker(num, queue_in, generator, gen_lock, db_struct,
         (pkg, ) = job
 
         pkg_id = generate_new_id(generator, gen_lock)
+        local_pkg = get_local_file(pkg, temp_dir)
-        hdr = get_rpm_header(rpm_ts, pkg)
+        hdr = get_rpm_header(rpm_ts, local_pkg)
         package_values = []
         package_values.append(pkg_id)
         for tag in db_struct['packages_tags']:
             hval = hdr[tag]
             package_values.append(
-                sqlite3.Binary(hval) if tag in db_struct['blob_tags'] else \
-                to_string(pkg, tag, hval) if type(hval) in [type([]), type('')] else \
-                hval
+                (sqlite3.Binary(hval) if len(hval)>0 else None)
+                if tag in db_struct['blob_tags'] else \
+                to_string(pkg, tag, hval) if \
+                type(hval) in [type([]), type('')] else hval
                 )
         package_values.append(repodir_id)
         package_values.append(pkg)
@@ -328,9 +333,9 @@ def process_package_worker(num, queue_in, generator, gen_lock, db_struct,
                                   dep_flags[i], dep_version[i],
                                   pkg_id, build_arch])
 
-        # fonts-ttf-decoratives-1.3-27-rosa.lts2012.0.noarch.rpm provides font(derdämonschriftkegel)
         (pkg_file_paths, pkg_file_names, pkg_file_sizes, pkg_file_modes) = \
-            (hdr['RPMTAG_FILEPATHS'], hdr['RPMTAG_BASENAMES'], hdr['RPMTAG_FILESIZES'], hdr['RPMTAG_FILEMODES'])
+            (hdr['RPMTAG_FILEPATHS'], hdr['RPMTAG_BASENAMES'],
+             hdr['RPMTAG_FILESIZES'], hdr['RPMTAG_FILEMODES'])
         files_list = data['package_files']
         files_dirs = {}
         obj_so_files_idx = []
@@ -368,7 +373,8 @@ def process_package_worker(num, queue_in, generator, gen_lock, db_struct,
 
         for fdir in sorted(files_dirs.keys()):
             if files_dirs[fdir]:
-                # Add parent directories as implicit files # TODO: recursive processing?
+                # Add parent directories as implicit files
+                # TODO: recursive processing?
                 pkg_file_id = generate_new_id(generator, gen_lock)
                 files_list.append([pkg_file_id, #FILE_REC_ID_IDX = 0
                                    pkg_id,
@@ -381,15 +387,17 @@ def process_package_worker(num, queue_in, generator, gen_lock, db_struct,
                                    ])
 
         if obj_so_files_idx:
-            pkg_temp_dir = os.path.join(temp_dir, os.path.basename(pkg))
+            pkg_temp_dir = os.path.join(temp_dir, os.path.basename(local_pkg))
             os.makedirs(pkg_temp_dir)
-            if extract_files(pkg, files_list, obj_so_files_idx, pkg_temp_dir):
+            if extract_files(local_pkg, files_list,
+                             obj_so_files_idx, pkg_temp_dir):
                 for i in obj_so_files_idx:
                     register_object(data, files_list[i], pkg_temp_dir,
                                     no_so_symbols)
             shutil.rmtree(pkg_temp_dir, True)
 
+        remove_cached_file(pkg)
         queue_in.task_done()
 
     conn = sqlite3.connect(DB, timeout=30)
@@ -419,25 +427,164 @@ INSERT INTO obj_symbols(obj_file_id, name, sym_type) VALUES(?, ?, ?)
     conn.commit()
     queue_in.task_done()
 
+local_cache = {}
+def get_local_file(url, temp_dir):
+    urlp = urlparse(url)
+    if urlp.scheme in ['http', 'https']:
+        cached_file_name = local_cache.get(url)
+        if cached_file_name and os.path.isfile(cached_file_name):
+            return cached_file_name
+        cache_dir = os.path.join(temp_dir, 'cache')
+        if not os.path.isdir(cache_dir):
+            os.makedirs(cache_dir)
+        temp_file = os.path.join(cache_dir, os.path.basename(url))
+        wget_url(url, temp_file)
+        local_cache[url] = temp_file
+        return temp_file
+    return url
+
+def remove_cached_file(url):
+    cached_file_name = local_cache.get(url)
+    if cached_file_name:
+        os.unlink(cached_file_name)
+        del local_cache[url]
+
+def wget_url(url, target_file):
+    urlp = urlparse(url)
+    wget_params = []
+    site = urlp.netloc
+    if urlp.username:
+        wget_params = wget_params + ['--auth-no-challenge',
+                                     '--http-user=%s' % urlp.username,
+                                     '--http-password=%s' %
+                                     ('""' if not urlp.password else urlp.password)]
+        site = site[site.find('@') + 1:]
+    url = urlunparse((urlp.scheme, site, urlp.path, urlp.params,
+                      urlp.query, urlp.fragment))
+    print 'Downloading %s...' % url
+    if target_file is None:
+        wget_params += ['-nv', '-O-', url]
+    else:
+        wget_params += ['-nv', '-O', target_file, url]
+    p = subprocess.Popen(['wget'] + wget_params,
+                         stdout=subprocess.PIPE,
+                         stderr=subprocess.PIPE)
+    wget_out, wget_err = p.communicate()
+    if p.returncode != 0:
+        print >> sys.stderr, ('Unable to get data from the url: %s '
+                              '(error: %d).\n%s\n%s') % \
+            (url, p.returncode, wget_out, wget_err)
+        raise Exception('Unable to download data (%d).'
+                        % p.returncode)
+    if target_file is None:
+        return wget_out
+
+def parse_index_html(index_html, base_url, filter_ext):
+    file_list = []
+    for match in re.finditer(r'href="([^"]+)"', index_html, re.M):
+        filename = match.group(1)
+        if filename.endswith(filter_ext):
+            filepath = os.path.join(base_url, filename)
+            if os.path.dirname(filepath) == base_url.rstrip('/') and \
+                    os.path.basename(filepath) == filename:
+                file_list.append(filepath)
+    return file_list
+
+def download_repodir(source_urlp, cache_dir):
+    site = source_urlp.netloc
+    site = site[site.find('@') + 1:]
+    target_dir = os.path.join(cache_dir,
+                              site,
+                              source_urlp.path.lstrip('/'))
+    if not os.path.isdir(target_dir):
+        os.makedirs(target_dir)
+    remote_files = {}
+    if source_urlp.scheme in ['http', 'https']:
+        source_url = source_urlp.geturl()
+        remote_dir_contents = parse_index_html(wget_url(source_url, None),
+                                               source_url, '.rpm')
+        for remote_file in remote_dir_contents:
+            remote_filename = urllib.unquote(os.path.basename(remote_file))
+            remote_files[remote_filename] = True
+            target_file = os.path.join(target_dir, remote_filename)
+            if os.path.isfile(target_file):
+                continue
+            wget_url(remote_file, target_file)
+
+    for local_filename in os.listdir(target_dir):
+        if local_filename not in remote_files and \
+                local_filename.endswith('.rpm'):
+            print 'Removing local file: %s.' % local_filename
+            os.unlink(os.path.join(target_dir, local_filename))
+
+    return target_dir
+
+def urpm_get_packages(media):
+    p = subprocess.Popen(['urpmq', '-r', '--ignorearch',
+                          '--list', '--media', media],
+                         stdout=subprocess.PIPE,
+                         stderr=subprocess.PIPE)
+    urpmq_out, urpmq_err = p.communicate()
+    if p.returncode != 0 or len(urpmq_err) > 0:
+        print >> sys.stderr, ('Unable to get a list of packages '
+                              'from the media: %s.\n'
+                              '%s\n%s') % (media, urpmq_out, urpmq_err)
+        raise Exception('Unable to get a list of packages (%d).' % p.returncode)
+# urpmi --no-install --allow-nodeps --force
+#   --download-all=/tmp/ xine-wavpack-1.2.4-1plf --media Desktop2012.1-8
+    p = subprocess.Popen(['urpmq', '-f', '--ignorearch',
+                          '--list', '--media', media],
+                         stdout=subprocess.PIPE,
+                         stderr=subprocess.PIPE)
+    urpmq_out, urpmq_err = p.communicate()
+    if p.returncode != 0 or len(urpmq_err) > 0:
+        print >> sys.stderr, ('Unable to get a list of packages '
+                              'from the media: %s.\n'
+                              '%s\n%s') % (media, urpmq_out, urpmq_err)
+        raise Exception('Unable to get a list of packages (%d).'
+                        % p.returncode)
+
+    #qr_lines = urpmq_out.split('\n')
+    raise Exception('Not implemented.')
+
+def urpm_get_repodir(repodir_name, cache_dir):
+    target_dir = os.path.join(cache_dir,
+                              repodir_name,
+                              'rpms')
+    if not os.path.isdir(target_dir):
+        os.makedirs(target_dir)
+    urpm_files = {}
+    urpm_media_contents = urpm_get_packages(repodir_name)
+    raise Exception('Not implemented.')
+
 generator_value = 0
 
-def process_repodir(repodir_path, repodir_id, build_archs, conn, db_struct,
-                    tempdir, no_shared_objects, no_so_symbols):
+def process_repodir(xrepodir, repodir_id, cache_dir, build_archs, conn,
+                    db_struct, temp_dir, no_shared_objects, no_so_symbols):
+    repodir_url = xrepodir.get('url')
+    urlp = urlparse(repodir_url)
+    working_url = repodir_url
+    if cache_dir is not None:
+        if urlp.scheme in ['http', 'https']:
+            working_url = download_repodir(urlp, cache_dir)
+        elif urlp.scheme == 'urpm':
+            working_url = urpm_get_repodir(xrepodir.get('name'), cache_dir)
+        elif urlp.scheme not in ['', 'file']:
+            raise Exception('Invalid scheme in the repository url: %s' %
+                            repodir_url)
     rpm_list = []
-    rpm_list = getFileList(repodir_path, '.rpm', rpm_list)
+    rpm_list = get_files(working_url, '.rpm')
     if not rpm_list:
         return
-    print repodir_path, ': ', len(rpm_list)
-
+    print urlp.netloc[urlp.netloc.find('@') + 1:] + urlp.path, ': ', \
+        len(rpm_list)
     if not db_struct.get('defined'):
         rpm_ts = rpm.TransactionSet()
         rpm_ts.setVSFlags(~(rpm.RPMVSF_NEEDPAYLOAD))
         # ts.setVSFlags(~(rpm.RPMVSF_NOMD5|rpm.RPMVSF_NEEDPAYLOAD))
-        hdr = get_rpm_header(rpm_ts, rpm_list[0])
+        hdr = get_rpm_header(rpm_ts, get_local_file(rpm_list[0], temp_dir))
 
         packages_extra_fields = {'repodir_id': 'INTEGER',
-                                 'rpm_filepath': 'TEXT',
+                                 'package_url': 'TEXT',
                                  'sourcerpm_package': 'TEXT'}
 
         file_tags_re = r'^RPMTAG_(BASENAMES|FILE[\w\d]+)'
@@ -446,8 +593,9 @@ def process_repodir(repodir_path, repodir_id, build_archs, conn, db_struct,
         trigger_tags_re = r'^RPMTAG_TRIGGER\w+'
         datetime_tags = ['RPMTAG_PACKAGETIME', 'RPMTAG_RPMLIBTIMESTAMP', ]
 
-        db_struct['blob_tags'] = ['RPMTAG_DSAHEADER', 'RPMTAG_HEADERIMMUTABLE',
-                                  'RPMTAG_PKGID', 'RPMTAG_SIGMD5']
+        db_struct['blob_tags'] = ['RPMTAG_RSAHEADER', 'RPMTAG_DSAHEADER',
+                                  'RPMTAG_HEADERIMMUTABLE', 'RPMTAG_SIGMD5',
+                                  'RPMTAG_PKGID', 'RPMTAG_SOURCEPKGID']
 
         reserved_field_names = ['id', 'group']
         skip_tags_re = '^RPMTAG_(C|D|E|N|P|R|V|HEADERIMMUTABLE)$'
@@ -469,10 +617,11 @@ def process_repodir(repodir_path, repodir_id, build_archs, conn, db_struct,
         rpmtags = [str(t) for t in dir(rpm) if t.startswith('RPMTAG_') ]
 
         for tag in rpmtags:
-            if re.match(file_tags_re, tag) or re.match(dir_tags_re, tag) or \
-               re.match(changelog_tags_re, tag) or \
-               re.match(skip_tags_re, tag) or re.match(trigger_tags_re, tag) or \
-               re.match(dep_tags_re, tag):
+            if (re.match(file_tags_re, tag) or re.match(dir_tags_re, tag) or
+                    re.match(changelog_tags_re, tag) or
+                    re.match(skip_tags_re, tag) or
+                    re.match(trigger_tags_re, tag) or
+                    re.match(dep_tags_re, tag)):
                 continue
             sqltype = "TIMESTAMP" if tag in datetime_tags else \
                 "BLOB" if tag in db_struct['blob_tags'] else \
@@ -513,19 +662,21 @@ CREATE TABLE IF NOT EXISTS %s (id INTEGER PRIMARY KEY NOT NULL,
     for i in xrange(NUM_PROCESSES):
         queue_in.put(None)
 
-    # run workers
-    gc.collect() # Trying to prevent Exception AssertionError: AssertionError() in ignored
+    # Trying to prevent Exception AssertionError: AssertionError() in
+    # ignored
+    gc.collect()
     time.sleep(1)
     gc.disable()
 
     global generator_value
     id_generator = mp.Value('i', generator_value)
     generator_lock = mp.Lock()
+    # run workers
     workers = []
     for i in xrange(NUM_PROCESSES):
         worker = mp.Process(target = process_package_worker,
                             args = (i, queue_in, id_generator, generator_lock,
                                     db_struct,
-                                    repodir_id, build_archs, tempdir,
+                                    repodir_id, build_archs, temp_dir,
                                     no_shared_objects, no_so_symbols))
         workers.append(worker)
         worker.start()
@@ -553,20 +704,25 @@ def main(args):
     parser = ET.XMLParser()
     tree = ET.parse(options.config, parser=parser)
     config_root = tree.getroot()
-    tempdir = '/dev/shm/rt-tmp/'
-    shutil.rmtree(tempdir, True)
-    os.mkdir(tempdir)
+    temp_dir = '/dev/shm/rt-tmp/'
+    shutil.rmtree(temp_dir, True)
+    os.mkdir(temp_dir)
     rpm_db_struct = {}
     for xrepodir in config_root.find('repositories').findall('dir'):
        repodir_id = add_repodir(xrepodir, conn)
        build_archs = [None] if xrepodir.get('sources') != '.' else \
            get_build_archs(xrepodir, config_root.find('repositories'))
-       process_repodir(xrepodir.get('path'), repodir_id, build_archs, conn,
-                       rpm_db_struct, tempdir, options.no_shared_objects,
-                       options.no_so_symbols)
-    shutil.rmtree(tempdir, True)
-    index_database(conn)
+       process_repodir(xrepodir, repodir_id, options.cache_dir,
+                       build_archs, conn, rpm_db_struct, temp_dir,
+                       options.no_shared_objects, options.no_so_symbols)
+    shutil.rmtree(temp_dir, True)
+    if rpm_db_struct.get('defined'):
+        index_database(conn)
+    else:
+        print 'Database was not initialized ' \
+              '(check whether repositories are empty).'
+        os.unlink(DB)
 
 if __name__ == "__main__":
diff --git a/repo-analyze-config.xml b/repo-analyze-config.xml
index cde2d78..00da2d2 100644
--- a/repo-analyze-config.xml
+++ b/repo-analyze-config.xml
@@ -4,77 +4,77 @@
+         url="http://{token}@abf-downloads.rosalinux.ru/rosa-dx-chrome-1.0/repository/i586/debug_main/release/">
+         url="http://{token}@abf-downloads.rosalinux.ru/rosa-dx-chrome-1.0/repository/i586/main/release/">
+         url="http://{token}@abf-downloads.rosalinux.ru/rosa-dx-chrome-1.0/repository/i586/main/updates/">
     rosa-dx-chrome-1.0/i586/main/release
+         url="http://{token}@abf-downloads.rosalinux.ru/dx_rc_personal/repository/rosa-dx-chrome-1.0/i586/debug_main/release/">
     rosa-dx-chrome-1.0/i586/main/release
+         url="http://{token}@abf-downloads.rosalinux.ru/dx_rc_personal/repository/rosa-dx-chrome-1.0/i586/main/release/">
     rosa-dx-chrome-1.0/i586/main/release
+         url="http://{token}@abf-downloads.rosalinux.ru/rosa-dx-chrome-1.0/repository/x86_64/debug_main/release/">
+         url="http://{token}@abf-downloads.rosalinux.ru/rosa-dx-chrome-1.0/repository/x86_64/main/release/">
+         url="http://{token}@abf-downloads.rosalinux.ru/rosa-dx-chrome-1.0/repository/x86_64/main/updates/">
     rosa-dx-chrome-1.0/x86_64/main/release
+         url="http://{token}@abf-downloads.rosalinux.ru/dx_rc_personal/repository/rosa-dx-chrome-1.0/x86_64/debug_main/release/">
     rosa-dx-chrome-1.0/x86_64/main/release
+         url="http://{token}@abf-downloads.rosalinux.ru/dx_rc_personal/repository/rosa-dx-chrome-1.0/x86_64/main/release/">
     rosa-dx-chrome-1.0/x86_64/main/release
+         url="http://{token}@abf-downloads.rosalinux.ru/rosa-dx-chrome-1.0/repository/SRPMS/main/release/">
     rosa-dx-chrome-1.0/$arch/main/release
+         url="http://{token}@abf-downloads.rosalinux.ru/dx_rc_personal/repository/rosa-dx-chrome-1.0/SRPMS/main/release/">
     rosa-dx-chrome-1.0/$arch/main/release
     dx_rc_personal/$arch/main/release
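
Note on the per-package cache: get_local_file()/remove_cached_file() in this patch amount to a download-once, evict-after-use map keyed by URL. The following is a minimal standalone sketch of that flow, not part of the patch; the fetch callback is a stand-in for wget_url():

# Sketch (Python 2, as in fill-repodb.py) of the cache flow described above.
import os
from urlparse import urlparse

local_cache = {}

def get_local_file(url, temp_dir, fetch):
    # Anything that is not http(s) is already a local path.
    if urlparse(url).scheme not in ('http', 'https'):
        return url
    cached = local_cache.get(url)
    if cached and os.path.isfile(cached):
        return cached
    cache_dir = os.path.join(temp_dir, 'cache')
    if not os.path.isdir(cache_dir):
        os.makedirs(cache_dir)
    target = os.path.join(cache_dir, os.path.basename(url))
    fetch(url, target)  # e.g. wget_url() in the patch
    local_cache[url] = target
    return target

def remove_cached_file(url):
    # Evict the downloaded copy once the package has been processed,
    # so only the packages currently in flight occupy /dev/shm.
    cached = local_cache.pop(url, None)
    if cached and os.path.isfile(cached):
        os.unlink(cached)

Without -c cache-dir, every package passes through the temporary directory exactly once; with it, download_repodir() mirrors the whole repository into the cache first, so repeated runs (fill-repodb.py -c cache-dir repo-analyze-config.xml) only download new packages.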
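
The credentials handling in wget_url() deserves a note: the user:password@ part is stripped from the netloc and handed to wget via --auth-no-challenge/--http-user/--http-password, so credentials never appear in the printed 'Downloading ...' line. A sketch of just that step (same Python 2 urlparse as the patch; split_credentials is a name invented here for illustration):

from urlparse import urlparse, urlunparse

def split_credentials(url):
    # Returns (url with user:password@ removed, extra wget arguments).
    urlp = urlparse(url)
    wget_params = []
    site = urlp.netloc
    if urlp.username:
        wget_params += ['--auth-no-challenge',
                        '--http-user=%s' % urlp.username,
                        '--http-password=%s' % (urlp.password or '""')]
        site = site[site.find('@') + 1:]  # drop the credentials part
    clean_url = urlunparse((urlp.scheme, site, urlp.path,
                            urlp.params, urlp.query, urlp.fragment))
    return clean_url, wget_params

# split_credentials('https://user:pw@example.org/a.rpm') returns
# ('https://example.org/a.rpm',
#  ['--auth-no-challenge', '--http-user=user', '--http-password=pw'])

This matters for the config above, where every repository url embeds an ABF {token}.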
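
Finally, download_repodir() keeps the local mirror in sync with the remote directory index in both directions: .rpm files present in the index are downloaded unless already cached, and cached .rpm files that have disappeared from the index are deleted. Reduced to its core (a sketch only; fetch_file again stands in for wget_url, and the href pattern matches simple autoindex pages, as in parse_index_html()):

import os, re, urllib

def sync_mirror(index_html, base_url, target_dir, fetch_file):
    # Names of .rpm files currently present in the remote index.
    remote = {}
    for m in re.finditer(r'href="([^"]+)"', index_html):
        href = m.group(1)
        if not href.endswith('.rpm'):
            continue
        name = urllib.unquote(href)
        remote[name] = True
        # Download whatever the mirror does not have yet.
        target = os.path.join(target_dir, name)
        if not os.path.isfile(target):
            fetch_file(base_url.rstrip('/') + '/' + href, target)
    # Drop cached packages that are gone from the remote side.
    for name in os.listdir(target_dir):
        if name.endswith('.rpm') and name not in remote:
            os.unlink(os.path.join(target_dir, name))

One design consequence: a package removed upstream is also removed from the cache on the next run, so a database built from the cache always reflects the current state of the repository.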