From da73a2c84015a1b75377ab0a78dcb9c1ea2858fe Mon Sep 17 00:00:00 2001 From: Gerrod Ubben Date: Sat, 21 Oct 2023 03:23:30 -0400 Subject: [PATCH] Add dependency syncing fixes: #340 --- CHANGES/340.feature | 1 + docs/workflows/sync.rst | 29 ++++++++ .../0012_pythonremote_sync_dependencies.py | 17 +++++ pulp_python/app/models.py | 1 + pulp_python/app/serializers.py | 7 +- pulp_python/app/tasks/sync.py | 73 ++++++++++++++++++- pulp_python/app/utils.py | 6 ++ pulp_python/tests/functional/api/test_sync.py | 64 ++++++++++++++++ pulp_python/tests/functional/constants.py | 2 + 9 files changed, 196 insertions(+), 4 deletions(-) create mode 100644 CHANGES/340.feature create mode 100644 pulp_python/app/migrations/0012_pythonremote_sync_dependencies.py diff --git a/CHANGES/340.feature b/CHANGES/340.feature new file mode 100644 index 00000000..59e43157 --- /dev/null +++ b/CHANGES/340.feature @@ -0,0 +1 @@ +Added ability (tech-preview) to recursively sync dependencies from a remote with `sync_dependencies=True`. diff --git a/docs/workflows/sync.rst b/docs/workflows/sync.rst index d6719fe0..caa6a778 100644 --- a/docs/workflows/sync.rst +++ b/docs/workflows/sync.rst @@ -68,6 +68,11 @@ Remote Create Response:: ], "excludes": [], "prereleases": true, + "package_types": [], + "keep_latest_packages": 0, + "exclude_platforms": [], + "sync_dependencies": false + } @@ -119,6 +124,30 @@ Reference: `Python Remote Usage <../restapi.html#tag/Remotes:-Python>`_ .. _mirror-workflow: +Syncing Dependencies +-------------------- + +When specifying included packages to sync, Pulp can also sync the dependencies of those packages:: + + $ pulp python remote create \ + --name 'packages-with-dependencies' \ + --url 'https://pypi.org/' \ + --sync-dependencies # Enable syncing dependencies for included packages \ + --includes '[ + "django>=4.0", # Sync the dependencies for each django version >=4.0 + "pulpcore[s3]", # Sync the dependencies for all pulpcore versions + extra dependencies for s3 + ]' + +Turning on dependency syncing will only sync the necessary dependencies to install the package for the +given versions declared in the includes list. You can sync the extra dependencies of a package by +using the `extras notation `_. Synced dependencies are also +filtered through the other filters defined on the remote. Note that many packages have unrestricted +dependencies which can cause syncs to become significantly larger than normal. It is recommended +to use extra filters to trim any unwanted packages. + +.. warning:: This feature is provided as a tech preview and could change in backwards incompatible + ways in the future. + Creating a remote to sync all of PyPI _____________________________________ diff --git a/pulp_python/app/migrations/0012_pythonremote_sync_dependencies.py b/pulp_python/app/migrations/0012_pythonremote_sync_dependencies.py new file mode 100644 index 00000000..b9a887a8 --- /dev/null +++ b/pulp_python/app/migrations/0012_pythonremote_sync_dependencies.py @@ -0,0 +1,17 @@ +# Generated by Django 4.2.5 on 2023-10-19 03:05 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("python", "0011_alter_pythondistribution_distribution_ptr_and_more"), + ] + + operations = [ + migrations.AddField( + model_name="pythonremote", + name="sync_dependencies", + field=models.BooleanField(default=False), + ), + ] diff --git a/pulp_python/app/models.py b/pulp_python/app/models.py index 0e8b29fc..d62a97c7 100644 --- a/pulp_python/app/models.py +++ b/pulp_python/app/models.py @@ -235,6 +235,7 @@ class PythonRemote(Remote): exclude_platforms = ArrayField( models.CharField(max_length=10, blank=True), choices=PLATFORMS, default=list ) + sync_dependencies = models.BooleanField(default=False) def get_remote_artifact_url(self, relative_path=None, request=None): """Get url for remote_artifact""" diff --git a/pulp_python/app/serializers.py b/pulp_python/app/serializers.py index 2573a1e1..428f3def 100644 --- a/pulp_python/app/serializers.py +++ b/pulp_python/app/serializers.py @@ -325,6 +325,11 @@ class PythonRemoteSerializer(core_serializers.RemoteSerializer): choices=python_models.PLATFORMS, default=list ) + sync_dependencies = serializers.BooleanField( + required=False, + help_text=_("Whether to sync dependencies specified by package metadata. (Tech Preview)"), + default=False, + ) def validate_includes(self, value): """Validates the includes""" @@ -351,7 +356,7 @@ def validate_excludes(self, value): class Meta: fields = core_serializers.RemoteSerializer.Meta.fields + ( "includes", "excludes", "prereleases", "package_types", "keep_latest_packages", - "exclude_platforms", + "exclude_platforms", "sync_dependencies" ) model = python_models.PythonRemote diff --git a/pulp_python/app/tasks/sync.py b/pulp_python/app/tasks/sync.py index c4f93dfa..cd1c0817 100644 --- a/pulp_python/app/tasks/sync.py +++ b/pulp_python/app/tasks/sync.py @@ -1,9 +1,13 @@ import logging +import tempfile +from typing import Dict, Set from aiohttp import ClientResponseError, ClientError +from collections import defaultdict +from itertools import chain from lxml.etree import LxmlError from gettext import gettext as _ -from os import environ, path +from os import environ from rest_framework import serializers @@ -19,13 +23,14 @@ PythonPackageContent, PythonRemote, ) -from pulp_python.app.utils import parse_metadata, PYPI_LAST_SERIAL +from pulp_python.app.utils import parse_json, parse_metadata, PYPI_LAST_SERIAL from pypi_simple import parse_repo_index_page from bandersnatch.mirror import Mirror from bandersnatch.master import Master from bandersnatch.configuration import BandersnatchConfig from packaging.requirements import Requirement +from packaging.utils import canonicalize_name from urllib.parse import urljoin, urlsplit, urlunsplit logger = logging.getLogger(__name__) @@ -113,7 +118,8 @@ async def run(self): """ # Prevent bandersnatch from reading actual .netrc file, set to nonexistent file # See discussion on https://github.com/pulp/pulp_python/issues/581 - environ["NETRC"] = f"{path.curdir}/.fake-netrc" + fake_netrc = tempfile.NamedTemporaryFile(dir=".", delete=False) + environ["NETRC"] = fake_netrc.name # TODO Change Bandersnatch internal API to take proxy settings in from config parameters if proxy_url := self.remote.proxy_url: if self.remote.proxy_username or self.remote.proxy_password: @@ -146,6 +152,23 @@ async def run(self): Requirement(pkg).name for pkg in self.remote.includes ] await pmirror.synchronize(packages_to_sync) + if pmirror.sync_dependencies: + depth = 1 + while pmirror.dependencies_to_sync and depth <= 25: # ensure no circular loops + logger.info(_("Syncing dependencies: depth {}").format(depth)) + depth += 1 + packages_to_sync = list(pmirror.dependencies_to_sync.keys()) + pmirror.allow_filter.allowlist_release_requirements = list( + chain(*pmirror.dependencies_to_sync.values()) + ) + logger.info( + f"Re-initialized release plugin {pmirror.allow_filter.name}, filtering " + + f"{pmirror.allow_filter.allowlist_release_requirements}" + ) + pmirror.dependencies_to_sync.clear() + await pmirror.synchronize(packages_to_sync) + if pmirror.dependencies_to_sync: + logger.warning(_("Reached dependency sync depth limit! Breaking out")) class PulpMirror(Mirror): @@ -160,8 +183,18 @@ def __init__( super().__init__(master=master, workers=workers) self.synced_serial = serial self.python_stage = python_stage + self.remote = self.python_stage.remote self.progress_report = progress_report self.deferred_download = deferred_download + self.sync_dependencies = self.remote.includes and self.remote.sync_dependencies + if self.sync_dependencies: + # Find the allowlist_filter, so we can update it when syncing dependencies + for fil in self.filters.filter_release_plugins(): + if fil.name == "allowlist_release": + self.allow_filter = fil + break + self.already_synced: Dict[str, Set[str]] = defaultdict(set) + self.dependencies_to_sync: Dict[str, Set[Requirement]] = defaultdict(set) async def determine_packages_to_sync(self): """ @@ -230,6 +263,28 @@ async def create_content(self, pkg): create a Content Unit to put into the pipeline """ for version, dists in pkg.releases.items(): + if self.sync_dependencies: + if version in self.already_synced[pkg.name]: + continue + self.already_synced[pkg.name].add(version) + + for req_spec in await self.get_required_dists(pkg, version): + req = Requirement(req_spec) + req.name = canonicalize_name(req.name) + req.specifier.prereleases = True + if req.marker: + if "extra == " in str(req.marker): + # Only sync the required dependency if we specified the correct 'extra' + extras = set() + for cur_allow_pkg in self.allow_filter.allowlist_release_requirements: + if cur_allow_pkg.name == pkg.name: + extras |= cur_allow_pkg.extras + extra = str(req.marker).rpartition("extra == ")[2].strip("'\"") + if extra not in extras: + continue + + self.dependencies_to_sync[req.name].add(req) + for package in dists: entry = parse_metadata(pkg.info, version, package) url = entry.pop("url") @@ -258,3 +313,15 @@ def on_error(self, exception, **kwargs): This should have some error checking """ pass + + async def get_required_dists(self, pkg, version): + """Returns a list of required dists from given package version.""" + # TODO: Can this logic live in Bandersnatch? + url = urljoin(self.remote.url, f"pypi/{pkg.name}/{version}/json") + downloader = self.remote.get_downloader(url=url) + try: + result = await downloader.run() + except ClientResponseError: + return [] + else: + return parse_json(result).get("info", {}).get("requires_dist", []) or [] diff --git a/pulp_python/app/utils.py b/pulp_python/app/utils.py index eb1082de..57f83524 100644 --- a/pulp_python/app/utils.py +++ b/pulp_python/app/utils.py @@ -312,3 +312,9 @@ def write_simple_detail(project_name, project_packages, streamed=False): detail = Template(simple_detail_template) context = {"project_name": project_name, "project_packages": project_packages} return detail.stream(**context) if streamed else detail.render(**context) + + +def parse_json(download_result): + """Parses JSON file.""" + with open(download_result.path) as fd: + return json.load(fd) diff --git a/pulp_python/tests/functional/api/test_sync.py b/pulp_python/tests/functional/api/test_sync.py index b64e8404..2732eafa 100644 --- a/pulp_python/tests/functional/api/test_sync.py +++ b/pulp_python/tests/functional/api/test_sync.py @@ -34,6 +34,8 @@ PYTHON_LG_FIXTURE_SUMMARY, PYTHON_LG_FIXTURE_COUNTS, DJANGO_LATEST_3, + DJANGO_PLUS_PYTZ, + DJANGO_PLUS_PYTZ_BCRYPT, SCIPY_COUNTS, ) from pulp_python.tests.functional.utils import gen_python_client, gen_python_remote @@ -687,6 +689,68 @@ def test_proxy_auth_sync( assert content_resp.count == 2 +@pytest.mark.parallel +def test_sync_dependency( + python_repo, python_repo_api_client, python_content_api_client, python_remote_factory +): + """Test syncing dependencies.""" + # The only required dependency for Django in our fixtures is pytz + body = gen_python_remote(includes=["Django"], sync_dependencies=True, prereleases=True) + remote = python_remote_factory(**body) + sync_resp = python_repo_api_client.sync(python_repo.pulp_href, {"remote": remote.pulp_href}) + monitor_task(sync_resp.task) + + repo = python_repo_api_client.read(python_repo.pulp_href) + assert repo.latest_version_href[-2] == "1" + + content_resp = python_content_api_client.list(repository_version=repo.latest_version_href) + assert content_resp.count == DJANGO_PLUS_PYTZ + + content_resp = python_content_api_client.list( + repository_version=repo.latest_version_href, name="pytz" + ) + assert content_resp.count > 0 + + +@pytest.mark.parallel +def test_sync_dependency_extras( + python_repo, python_repo_api_client, python_content_api_client, python_remote_factory +): + """Test syncing dependencies w/ extras""" + body = gen_python_remote(includes=["Django[bcrypt]"], sync_dependencies=True, prereleases=True) + remote = python_remote_factory(**body) + sync_resp = python_repo_api_client.sync(python_repo.pulp_href, {"remote": remote.pulp_href}) + monitor_task(sync_resp.task) + + repo = python_repo_api_client.read(python_repo.pulp_href) + assert repo.latest_version_href[-2] == "1" + + content_resp = python_content_api_client.list(repository_version=repo.latest_version_href) + assert content_resp.count == DJANGO_PLUS_PYTZ_BCRYPT + + content_resp = python_content_api_client.list( + repository_version=repo.latest_version_href, name="bcrypt" + ) + assert content_resp.count > 0 + + +@pytest.mark.parallel +def test_sync_dependency_not_present( + python_repo, python_repo_api_client, python_content_api_client, python_remote_factory +): + """Test syncing dependencies that are not present in the upstream doesn't fail the sync.""" + body = gen_python_remote(includes=["scipy"], sync_dependencies=True) + remote = python_remote_factory(**body) + sync_resp = python_repo_api_client.sync(python_repo.pulp_href, {"remote": remote.pulp_href}) + monitor_task(sync_resp.task) + + repo = python_repo_api_client.read(python_repo.pulp_href) + assert repo.latest_version_href[-2] == "1" + + content_resp = python_content_api_client.list(repository_version=repo.latest_version_href) + assert content_resp.count == SCIPY_COUNTS["total"] + + def sync_to_remote(self, body, create=False, mirror=False): """Takes a body and creates/updates a remote object, then it performs a sync""" if create: diff --git a/pulp_python/tests/functional/constants.py b/pulp_python/tests/functional/constants.py index d37c8f07..dd48b9a1 100644 --- a/pulp_python/tests/functional/constants.py +++ b/pulp_python/tests/functional/constants.py @@ -158,6 +158,8 @@ } DJANGO_LATEST_3 = 4 # latest version has 2 dists, each other has 1 +DJANGO_PLUS_PYTZ = 37 +DJANGO_PLUS_PYTZ_BCRYPT = 45 SCIPY_COUNTS = { "total": 23, # scipy has 23 different release files for the same release "windows": 8,