diff --git a/documentation/release_notes_7.markdown b/documentation/release_notes_7.markdown index 509193a911..b84aaad4a9 100644 --- a/documentation/release_notes_7.markdown +++ b/documentation/release_notes_7.markdown @@ -15,6 +15,35 @@ canonical: "/puppetdb/latest/release_notes.html" # PuppetDB: Release notes +## PuppetDB 7.20.1 + +Released TBD. + +### Bug fixes + +* Fixed an issue with report garbage collection where a partition would become + partially detached and block future garbage collection progress. Garbage + collection will now finalize the partition detach operation and remove the + table. ([GitHub #4013](https://github.com/puppetlabs/puppetdb/issues/4013)) +* Fixed an issue with report garbage collection where a partition would be + detached, but the table was never deleted. Garbage collection will now + identify and clean-up these tables. + ([GitHub #4013](https://github.com/puppetlabs/puppetdb/issues/4013)) + +## PuppetDB 7.20.0 + +Released October 22 2024 + +### Improvements + +* Released support and packages for Debian 12 (bookworm) + +### Bug fixes + +* Added a database constraint to prevent duplicate catalogs. If your database + contains any duplicate catalogs, only the most recent catalog for each + certname will be kept. + ## PuppetDB 7.19.1 Released July 23 2024 diff --git a/documentation/release_notes_8.markdown b/documentation/release_notes_8.markdown index 9ce1eb1e3d..d057b1ff39 100644 --- a/documentation/release_notes_8.markdown +++ b/documentation/release_notes_8.markdown @@ -14,6 +14,21 @@ canonical: "/puppetdb/latest/release_notes.html" ## PuppetDB 8.8.0 +Released TBD. + +### Bug fixes + +* Fixed an issue with report garbage collection where a partition would become + partially detached and block future garbage collection progress. Garbage + collection will now finalize the partition detach operation and remove the + table. ([GitHub #4013](https://github.com/puppetlabs/puppetdb/issues/4013)) +* Fixed an issue with report garbage collection where a partition would be + detached, but the table was never deleted. Garbage collection will now + identify and clean-up these tables. + ([GitHub #4013](https://github.com/puppetlabs/puppetdb/issues/4013)) + +## PuppetDB 8.8.0 + Released October 22 2024 ### Improvements diff --git a/src/puppetlabs/puppetdb/jdbc.clj b/src/puppetlabs/puppetdb/jdbc.clj index b80536b4ff..8a9c9adaeb 100644 --- a/src/puppetlabs/puppetdb/jdbc.clj +++ b/src/puppetlabs/puppetdb/jdbc.clj @@ -40,6 +40,7 @@ :invalid-regular-expression "2201B" :lock-not-available "55P03" :program-limit-exceeded "54000" + :not-in-prerequisite-state "55000" :query-canceled "57014" :serialization-failure "40001" ;; check constraint violation diff --git a/src/puppetlabs/puppetdb/scf/partitioning.clj b/src/puppetlabs/puppetdb/scf/partitioning.clj index 8d5912b5fe..2795ec4022 100644 --- a/src/puppetlabs/puppetdb/scf/partitioning.clj +++ b/src/puppetlabs/puppetdb/scf/partitioning.clj @@ -1,6 +1,7 @@ (ns puppetlabs.puppetdb.scf.partitioning "Handles all work related to database table partitioning" (:require + [clojure.string :refer [lower-case]] [puppetlabs.i18n.core :refer [trs]] [puppetlabs.puppetdb.jdbc :as jdbc] [schema.core :as s]) @@ -33,7 +34,7 @@ (defn date-suffix [date] (let [formatter (.withZone DateTimeFormatter/BASIC_ISO_DATE (ZoneId/of "UTC"))] - (.format date formatter))) + (lower-case (.format date formatter)))) (defn to-zoned-date-time [date] diff --git a/src/puppetlabs/puppetdb/scf/storage.clj b/src/puppetlabs/puppetdb/scf/storage.clj index 4f271fe81d..57e5b709af 100644 --- a/src/puppetlabs/puppetdb/scf/storage.clj +++ b/src/puppetlabs/puppetdb/scf/storage.clj @@ -1625,6 +1625,39 @@ (let [{current-db-version :version} (sutils/db-metadata)] (not (neg? (compare current-db-version pg14-db))))) +(defn finalize-pending-detach + "Finalize a previously failed detach operation. A partitioned table can + only have one partition pending detachment at any time." + [parent] + (let [pending (->> ["SELECT inhrelid::regclass AS child + FROM pg_catalog.pg_inherits + WHERE inhparent = ?::regclass AND inhdetachpending = true" + parent] + jdbc/query-to-vec + first + :child)] + (when pending + (log/info (trs "Finalizing detach for partition {0}" pending)) + (jdbc/do-commands (format "ALTER TABLE %s DETACH PARTITION %s FINALIZE" parent pending)) + (str pending)))) + +(defn find-stranded-partitions + "Identify tables that match the child format of a partitioned table (like reports_historical) + that are not present in the pg_inherits table. These partitions have been detached, but failed + to be deleted. + + Tables that are not partitioned will also not be in the pg_inherits table, so you MUST + write a child-format that does not match any non-partitioned tables. + + Returns a list of strings. Each string is a stranded partition that should be removed." + [child-format] + (->> [(str "SELECT tablename" + " FROM pg_tables WHERE tablename ~ ?" + " AND tablename NOT IN (SELECT inhrelid::regclass::text FROM pg_catalog.pg_inherits)") + child-format] + jdbc/query-to-vec + (map (comp str :tablename)))) + (defn prune-daily-partitions "Either detaches or drops obsolete day-oriented partitions older than the date. Deletes or detaches only the oldest such candidate if @@ -1650,14 +1683,27 @@ candidates (->> (partitioning/get-partition-names parent-table) (filter expired?) sort) - drop-one (fn [table] + detach (fn detach [parent child] + (jdbc/do-commands-outside-txn + (format "alter table %s detach partition %s concurrently" parent child))) + drop-one (fn drop-one [table] (update-lock-status status-key inc) (try! (if just-detach? - (jdbc/do-commands-outside-txn - (format "alter table %s detach partition %s concurrently" parent-table table)) + (let [ex (try + (detach parent-table table) + (catch SQLException ex + (if (= (jdbc/sql-state :not-in-prerequisite-state) (.getSQLState ex)) + ex + (throw ex))))] + (when (instance? SQLException ex) + (let [finalized-table (finalize-pending-detach parent-table)] + (when-not (= finalized-table table) + ;; Retry, unless the finalized partition detach was + ;; for the same table + (detach parent-table table))))) (jdbc/do-commands - (format "drop table if exists %s cascade" table))) + (format "drop table if exists %s cascade" table))) (finally (update-lock-status status-key dec)))) drop #(if incremental? @@ -1697,7 +1743,7 @@ "Drops the given set of tables. Will throw an SQLException termination if the operation takes much longer than PDB_GC_DAILY_PARTITION_DROP_LOCK_TIMEOUT_MS." [old-partition-tables update-lock-status status-key] - (let [drop #(doseq [table old-partition-tables] + (let [drop #(doseq [table (distinct old-partition-tables)] (try (update-lock-status status-key inc) (jdbc/do-commands @@ -1720,10 +1766,11 @@ update-lock-status :write-locking-resource-events)) ;; PG14+ (let [detached-tables - (detach-daily-partitions "resource_events_historical" date incremental? - update-lock-status :write-locking-resource-events)] + (detach-daily-partitions "resource_events_historical" date incremental? + update-lock-status :write-locking-resource-events) + stranded-tables (find-stranded-partitions "^resource_events_\\d\\d\\d\\d\\d\\d\\d\\dz$")] (jdbc/with-db-transaction [] - (drop-partition-tables! detached-tables + (drop-partition-tables! (concat detached-tables stranded-tables) update-lock-status :write-locking-resource-events))))) (defn delete-reports-older-than-in-pg-11! @@ -1767,10 +1814,12 @@ (detach-daily-partitions "resource_events_historical" effective-resource-events-ttl incremental? update-lock-status :write-locking-resource-events) + stranded-events-tables (find-stranded-partitions "^resource_events_\\d\\d\\d\\d\\d\\d\\d\\dz$") detached-report-tables (detach-daily-partitions "reports_historical" report-ttl incremental? update-lock-status - :write-locking-reports)] + :write-locking-reports) + stranded-reports-tables (find-stranded-partitions "^reports_\\d\\d\\d\\d\\d\\d\\d\\dz$")] ;; Now we can delete the partitions with less intrusive locking. (jdbc/with-db-transaction [] ;; Nothing should acquire locks on the detached tables, but to be safe, acquire @@ -1780,9 +1829,9 @@ ;; force a resource-events GC. prior to partitioning, this would have happened ;; via a cascade when the report was deleted, but now we just drop whole tables ;; of resource events. - (drop-partition-tables! detached-resource-event-tables + (drop-partition-tables! (concat detached-resource-event-tables stranded-events-tables) update-lock-status :write-locking-resource-events) - (drop-partition-tables! detached-report-tables + (drop-partition-tables! (concat detached-report-tables stranded-reports-tables) update-lock-status :write-locking-reports)))))) ;; A db version that is "allowed" but not supported is deprecated diff --git a/test/puppetlabs/puppetdb/cli/services_test.clj b/test/puppetlabs/puppetdb/cli/services_test.clj index 5b941c29a1..641ead8492 100644 --- a/test/puppetlabs/puppetdb/cli/services_test.clj +++ b/test/puppetlabs/puppetdb/cli/services_test.clj @@ -8,7 +8,8 @@ [puppetlabs.puppetdb.scf.partitioning :refer [create-resource-events-partition create-reports-partition - get-temporal-partitions]] + get-temporal-partitions] + :as part] [puppetlabs.trapperkeeper.testutils.logging :refer [with-log-output logs-matching @@ -592,7 +593,7 @@ (with-test-db (let [config (-> (create-temp-config) (assoc :database *db* :read-database *read-db*) - (assoc-in [:database :gc-interval] "0.01")) + (assoc-in [:database :gc-interval] "60")) store-report #(sync-command-post (svc-utils/pdb-cmd-url) example-certname "store report" @@ -620,7 +621,77 @@ :resource-events-ttl (time/parse-period "1d") :db-lock-status db-lock-status}) (is (= 1 (count (jdbc/query ["SELECT * FROM reports_latest"])))) - (is (empty? (jdbc/query ["SELECT * FROM reports_historical"]))))))))) + (is (empty? (jdbc/query ["SELECT * FROM reports_historical"]))) + (jdbc/do-commands "DELETE FROM reports")) + + ;; These tests are not applicable unless our Postgres version is new enough + ;; to support the concurrent partition detach feature. + (when (scf-store/detach-partitions-concurrently?) + (testing "a partition stuck in the pending state is finalized and removed" + (let [old-ts (-> 2 time/days time/ago) + partition-table (format "reports_%s" + (part/date-suffix (part/to-zoned-date-time (time/to-timestamp old-ts)))) + lock-acquired (promise) + partition-pending-detach (promise)] + (store-report (time/to-string old-ts)) + (store-report (to-string (now))) + + (future + ;; Create a query that will block the ACCESS EXCLUSIVE lock needed + ;; by the second transaction of the concurrent detach below + (jdbc/with-transacted-connection *read-db* + (jdbc/with-db-transaction [] + (jdbc/query [(format "select * from %s" partition-table)]) + (deliver lock-acquired partition-table) + + ;; wait for partition detach to fail + @partition-pending-detach))) + + ;; Wait until we are sure that the detach partition operation will be blocked + @lock-acquired + + (try + (jdbc/do-commands-outside-txn + "SET statement_timeout = 100" + (format "ALTER TABLE reports_historical DETACH PARTITION %s CONCURRENTLY" partition-table)) + (catch java.sql.SQLException _) + (finally + (deliver partition-pending-detach partition-table) + (jdbc/do-commands-outside-txn "SET statement_timeout = 0"))) + + (is (= [{:inhdetachpending true}] + (jdbc/query ["select inhdetachpending from pg_catalog.pg_inherits where inhparent = 'reports_historical'::regclass and inhrelid = ?::regclass" partition-table]))) + + (svcs/sweep-reports! *db* {:incremental? false + :report-ttl (time/parse-period "1d") + :resource-events-ttl (time/parse-period "1d") + :db-lock-status db-lock-status}) + + (is (empty? + (jdbc/query ["SELECT tablename FROM pg_tables WHERE tablename = ?" partition-table]))) + + (jdbc/do-commands "DELETE FROM reports"))) + + (testing "a detached partition that was not removed is cleaned up by gc" + (let [old-ts (-> 2 time/days time/ago) + partition-table (format "reports_%s" + (part/date-suffix (part/to-zoned-date-time (time/to-timestamp old-ts))))] + (store-report (time/to-string old-ts)) + (store-report (to-string (now))) + + ;; Strand the partition before calling GC + (jdbc/do-commands-outside-txn + (format "ALTER TABLE reports_historical DETACH PARTITION %s CONCURRENTLY" partition-table)) + + (svcs/sweep-reports! *db* {:incremental? false + :report-ttl (time/parse-period "1d") + :resource-events-ttl (time/parse-period "1d") + :db-lock-status db-lock-status}) + + (is (empty? + (jdbc/query ["SELECT tablename FROM pg_tables WHERE tablename = ?" partition-table]))) + + (jdbc/do-commands "DELETE FROM reports"))))))))) (deftest reports-analysis ;; For now, just test for the initial invocation