From 500a5e647186f66da5a3b9f9e2635b0b25d19726 Mon Sep 17 00:00:00 2001 From: Gerrod Ubben Date: Wed, 27 Oct 2021 14:47:41 -0400 Subject: [PATCH] Add `last_serial` sync optimization fixes: #351 --- CHANGES/351.feature | 2 + .../0011_pythonrepository_last_serial.py | 18 ++++++ pulp_python/app/models.py | 6 ++ pulp_python/app/serializers.py | 10 +++- pulp_python/app/tasks/__init__.py | 2 +- pulp_python/app/tasks/sync.py | 60 +++++++++++++++++-- pulp_python/app/viewsets.py | 18 ++++++ pulp_python/tests/unit/test_models.py | 44 ++++++++++++++ 8 files changed, 152 insertions(+), 8 deletions(-) create mode 100644 CHANGES/351.feature create mode 100644 pulp_python/app/migrations/0011_pythonrepository_last_serial.py diff --git a/CHANGES/351.feature b/CHANGES/351.feature new file mode 100644 index 00000000..779ebb52 --- /dev/null +++ b/CHANGES/351.feature @@ -0,0 +1,2 @@ +Added ``last_serial`` sync optimization to Python repositories. +Subsequent syncs will use ``last_serial`` to get the changed packages since the previous sync. diff --git a/pulp_python/app/migrations/0011_pythonrepository_last_serial.py b/pulp_python/app/migrations/0011_pythonrepository_last_serial.py new file mode 100644 index 00000000..c3e812e7 --- /dev/null +++ b/pulp_python/app/migrations/0011_pythonrepository_last_serial.py @@ -0,0 +1,18 @@ +# Generated by Django 3.2.8 on 2021-10-21 20:25 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('python', '0010_update_json_field'), + ] + + operations = [ + migrations.AddField( + model_name='pythonrepository', + name='last_serial', + field=models.IntegerField(default=0), + ), + ] diff --git a/pulp_python/app/models.py b/pulp_python/app/models.py index 0f10b2e7..b1223a68 100644 --- a/pulp_python/app/models.py +++ b/pulp_python/app/models.py @@ -5,6 +5,7 @@ from django.core.exceptions import ObjectDoesNotExist from django.db import models from django.conf import settings +from django_lifecycle import hook, BEFORE_UPDATE from yarl import URL from pulpcore.plugin.models import ( @@ -236,6 +237,7 @@ class PythonRepository(Repository): REMOTE_TYPES = [PythonRemote] autopublish = models.BooleanField(default=False) + last_serial = models.IntegerField(default=0) class Meta: default_related_name = "%(app_label)s_%(model_name)s" @@ -261,3 +263,7 @@ def finalize_new_version(self, new_version): """ remove_duplicates(new_version) validate_repo_version(new_version) + + @hook(BEFORE_UPDATE, when="remote", has_changed=True) + def clear_last_serial(self): + self.last_serial = 0 diff --git a/pulp_python/app/serializers.py b/pulp_python/app/serializers.py index b823df5d..a89329a1 100644 --- a/pulp_python/app/serializers.py +++ b/pulp_python/app/serializers.py @@ -28,9 +28,17 @@ class PythonRepositorySerializer(core_serializers.RepositorySerializer): default=False, required=False, ) + last_serial = serializers.IntegerField( + help_text=_( + "The serial number from the last successful sync. Used in the sync process to " + "optimize the sync based on changes from previous sync. Use mirror=True to bypass" + "this optimization." + ), + read_only=True, + ) class Meta: - fields = core_serializers.RepositorySerializer.Meta.fields + ("autopublish",) + fields = core_serializers.RepositorySerializer.Meta.fields + ("autopublish", "last_serial") model = python_models.PythonRepository diff --git a/pulp_python/app/tasks/__init__.py b/pulp_python/app/tasks/__init__.py index fb88725d..956c9cde 100644 --- a/pulp_python/app/tasks/__init__.py +++ b/pulp_python/app/tasks/__init__.py @@ -3,5 +3,5 @@ """ from .publish import publish # noqa:F401 -from .sync import sync # noqa:F401 +from .sync import sync, update_remote # noqa:F401 from .upload import upload, upload_group # noqa:F401 diff --git a/pulp_python/app/tasks/sync.py b/pulp_python/app/tasks/sync.py index 641aeab5..35d92449 100644 --- a/pulp_python/app/tasks/sync.py +++ b/pulp_python/app/tasks/sync.py @@ -3,9 +3,10 @@ from gettext import gettext as _ from os import environ +from django.db import transaction from rest_framework import serializers -from pulpcore.plugin.models import Artifact, ProgressReport, Remote, Repository +from pulpcore.plugin.models import Artifact, ProgressReport, Remote from pulpcore.plugin.stages import ( DeclarativeArtifact, DeclarativeContent, @@ -16,6 +17,7 @@ from pulp_python.app.models import ( PythonPackageContent, PythonRemote, + PythonRepository, ) from pulp_python.app.utils import parse_metadata @@ -27,6 +29,42 @@ logger = logging.getLogger(__name__) +def update_remote(remote_pk, *args, **kwargs): + """ + Update `PythonRepository.last_serial` when URL or filters are updated. + + Args: + remote_pk (str): the id of the PythonRemote + data (dict): dictionary whose keys represent the fields of the model and their corresponding + values. + partial (bool): When true, only the fields specified in the data dictionary are updated. + When false, any fields missing from the data dictionary are assumed to be None and + their values are updated as such. + + Raises: + :class:`rest_framework.exceptions.ValidationError`: When serializer instance can't be saved + due to validation error. This theoretically should never occur since validation is + performed before the task is dispatched. + """ + from pulp_python.app.serializers import PythonRemoteSerializer + data = kwargs.pop("data", {}) + partial = kwargs.pop("partial", False) + with transaction.atomic(): + instance = PythonRemote.objects.get(pk=remote_pk) + fields = {f.name for f in instance._meta.local_fields if f.name != "includes"} + fields.update({"url", "policy"}) + if fields.intersection(data): + repos = list(PythonRepository.objects.filter(remote_id=remote_pk, last_serial__gt=0)) + for repo in repos: + repo.last_serial = 0 + PythonRepository.objects.bulk_update(repos, ["last_serial"]) + + + serializer = PythonRemoteSerializer(instance, data=data, partial=partial) + serializer.is_valid(raise_exception=True) + serializer.save() + + def sync(remote_pk, repository_pk, mirror): """ Sync content from the remote repository. @@ -43,15 +81,21 @@ def sync(remote_pk, repository_pk, mirror): """ remote = PythonRemote.objects.get(pk=remote_pk) - repository = Repository.objects.get(pk=repository_pk) + repository = PythonRepository.objects.get(pk=repository_pk) if not remote.url: raise serializers.ValidationError( detail=_("A remote must have a url attribute to sync.") ) - first_stage = PythonBanderStage(remote) - DeclarativeVersion(first_stage, repository, mirror).create() + same_remote = getattr(repository.remote, "pk", None) == remote_pk + serial = repository.last_serial if same_remote else 0 + first_stage = PythonBanderStage(remote, mirror, serial) + version = DeclarativeVersion(first_stage, repository, mirror).create() + if version is not None and same_remote: + if first_stage.next_serial and first_stage.next_serial != repository.last_serial: + repository.last_serial = first_stage.next_serial + repository.save() def create_bandersnatch_config(remote): @@ -97,10 +141,13 @@ class PythonBanderStage(Stage): Python Package Syncing Stage using Bandersnatch """ - def __init__(self, remote): + def __init__(self, remote, mirror, last_serial): """Initialize the stage and Bandersnatch config""" super().__init__() self.remote = remote + # If mirror=True, then sync everything, don't use serial + self.serial = last_serial if not mirror else 0 + self.next_serial = None create_bandersnatch_config(remote) async def run(self): @@ -119,7 +166,7 @@ async def run(self): message="Fetching Project Metadata", code="sync.fetching.project" ) as p: pmirror = PulpMirror( - serial=0, # Serial currently isn't supported by Pulp + serial=self.serial, master=master, workers=workers, deferred_download=deferred_download, @@ -132,6 +179,7 @@ async def run(self): Requirement(pkg).name for pkg in self.remote.includes ] await pmirror.synchronize(packages_to_sync) + self.next_serial = pmirror.target_serial class PulpMirror(Mirror): diff --git a/pulp_python/app/viewsets.py b/pulp_python/app/viewsets.py index 88f38121..09173b3b 100644 --- a/pulp_python/app/viewsets.py +++ b/pulp_python/app/viewsets.py @@ -135,6 +135,24 @@ class PythonRemoteViewSet(core_viewsets.RemoteViewSet): queryset = python_models.PythonRemote.objects.all() serializer_class = python_serializers.PythonRemoteSerializer + @extend_schema( + description="Trigger an asynchronous update task", + responses={202: AsyncOperationResponseSerializer}, + ) + def update(self, request, pk, **kwargs): + """Update remote.""" + partial = kwargs.pop("partial", False) + lock = [self.get_object()] + repos = python_models.PythonRepository.objects.filter(remote_id=pk, last_serial__gt=0) + lock.extend(repos) + async_result = dispatch( + tasks.update_remote, + exclusive_resources=lock, + args=(pk,), + kwargs={"data": request.data, "partial": partial}, + ) + return core_viewsets.OperationPostponedResponse(async_result, request) + @extend_schema( summary="Create from Bandersnatch", responses={201: python_serializers.PythonRemoteSerializer}, diff --git a/pulp_python/tests/unit/test_models.py b/pulp_python/tests/unit/test_models.py index 16e9754d..84f8210d 100644 --- a/pulp_python/tests/unit/test_models.py +++ b/pulp_python/tests/unit/test_models.py @@ -1,5 +1,12 @@ from django.test import TestCase +from pulp_python.app.models import PythonRemote, PythonRepository +from pulp_python.app.tasks import update_remote + + +DEFAULT_SERIAL = 10000 +MAX_SERIAL = 20000 + class TestNothing(TestCase): """Test Nothing (placeholder).""" @@ -7,3 +14,40 @@ class TestNothing(TestCase): def test_nothing_at_all(self): """Test that the tests are running and that's it.""" self.assertTrue(True) + + +class TestRepositoryLastSerial(TestCase): + """Tests `last_serial` gets properly set and reset with syncs and remote changes.""" + + def setUp(self): + self.remote = PythonRemote.objects.create(name="test", url="https://pypi.org") + self.repo = PythonRepository.objects.create(name="test", remote=self.remote, last_serial=DEFAULT_SERIAL) + + def test_remote_change(self): + """Test that `last_serial` gets reset upon remote change.""" + self.assertEqual(self.repo.remote.pk, self.remote.pk) + self.assertEqual(self.repo.last_serial, DEFAULT_SERIAL) + self.repo.remote = None + self.repo.save() + self.repo.refresh_from_db() + self.assertEqual(self.repo.last_serial, 0) + + def test_remote_update(self): + """Test that updating a remote will reset `last_serial`.""" + self.assertEqual(self.repo.remote.pk, self.remote.pk) + self.assertEqual(self.repo.last_serial, DEFAULT_SERIAL) + # Remote is only updated through update task + new_body = {"url": "https://test.pypi.org"} + update_remote(self.remote.pk, data=new_body, partial=True) + self.repo.refresh_from_db() + self.assertEqual(self.repo.last_serial, 0) + + def test_remote_update_no_change(self): + """Test that changing 'includes' field doesn't reset `last_serial`.""" + self.assertEqual(self.repo.remote.pk, self.remote.pk) + self.assertEqual(self.repo.last_serial, DEFAULT_SERIAL) + new_body = {"includes": ["shelf-reader"]} + update_remote(self.remote.pk, data=new_body, partial=True) + self.repo.refresh_from_db() + self.assertEqual(self.repo.last_serial, DEFAULT_SERIAL) +