Skip to content

Commit

Permalink
Merge pull request #4014 from austb/gh-4013/main/handle-report-gc-fai…
Browse files Browse the repository at this point in the history
…lures

Handle failure cases of report gc
  • Loading branch information
rbrw authored Oct 21, 2024
2 parents 25e8e0f + 7d86914 commit 18fa7cc
Show file tree
Hide file tree
Showing 5 changed files with 156 additions and 20 deletions.
15 changes: 15 additions & 0 deletions documentation/release_notes_7.markdown
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,21 @@ canonical: "/puppetdb/latest/release_notes.html"

# PuppetDB: Release notes

## PuppetDB 7.20.1

Released TBD.

### Bug fixes

* Fixed an issue with report garbage collection where a partition would become
partially detached and block future garbage collection progress. Garbage
collection will now finalize the partition detach operation and remove the
table. ([GitHub #4013](https://github.com/puppetlabs/puppetdb/issues/4013))
* Fixed an issue with report garbage collection where a partition would be
detached, but the table was never deleted. Garbage collection will now
identify and clean-up these tables.
([GitHub #4013](https://github.com/puppetlabs/puppetdb/issues/4013))

## PuppetDB 7.20.0

Released October 22 2024
Expand Down
1 change: 1 addition & 0 deletions src/puppetlabs/puppetdb/jdbc.clj
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
(or ({:admin-shutdown "57P01"
:invalid-regular-expression "2201B"
:program-limit-exceeded "54000"
:not-in-prerequisite-state "55000"
:lock-not-available "55P03"
:query-canceled "57014"}
kw-name)
Expand Down
5 changes: 3 additions & 2 deletions src/puppetlabs/puppetdb/scf/partitioning.clj
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
(ns puppetlabs.puppetdb.scf.partitioning
"Handles all work related to database table partitioning"
(:require
[clojure.string :refer [lower-case]]
[puppetlabs.i18n.core :refer [trs]]
[puppetlabs.puppetdb.jdbc :as jdbc]
[schema.core :as s])
Expand Down Expand Up @@ -32,8 +33,8 @@

(defn date-suffix
[date]
(let [formatter (.withZone (DateTimeFormatter/BASIC_ISO_DATE) (ZoneId/of "UTC"))]
(.format date formatter)))
(let [formatter (.withZone DateTimeFormatter/BASIC_ISO_DATE (ZoneId/of "UTC"))]
(lower-case (.format date formatter))))

(defn to-zoned-date-time
[date]
Expand Down
79 changes: 64 additions & 15 deletions src/puppetlabs/puppetdb/scf/storage.clj
Original file line number Diff line number Diff line change
Expand Up @@ -1673,6 +1673,39 @@
(let [{current-db-version :version} (sutils/db-metadata)]
(not (neg? (compare current-db-version pg14-db)))))

(defn finalize-pending-detach
"Finalize a previously failed detach operation. A partitioned table can
only have one partition pending detachment at any time."
[parent]
(let [pending (->> ["SELECT inhrelid::regclass AS child
FROM pg_catalog.pg_inherits
WHERE inhparent = ?::regclass AND inhdetachpending = true"
parent]
jdbc/query-to-vec
first
:child)]
(when pending
(log/info (trs "Finalizing detach for partition {0}" pending))
(jdbc/do-commands (format "ALTER TABLE %s DETACH PARTITION %s FINALIZE" parent pending))
(str pending))))

(defn find-stranded-partitions
"Identify tables that match the child format of a partitioned table (like reports_historical)
that are not present in the pg_inherits table. These partitions have been detached, but failed
to be deleted.
Tables that are not partitioned will also not be in the pg_inherits table, so you MUST
write a child-format that does not match any non-partitioned tables.
Returns a list of strings. Each string is a stranded partition that should be removed."
[child-format]
(->> [(str "SELECT tablename"
" FROM pg_tables WHERE tablename ~ ?"
" AND tablename NOT IN (SELECT inhrelid::regclass::text FROM pg_catalog.pg_inherits)")
child-format]
jdbc/query-to-vec
(map (comp str :tablename))))

(defn prune-daily-partitions
"Either detaches or drops obsolete day-oriented partitions
older than the date. Deletes or detaches only the oldest such candidate if
Expand All @@ -1698,14 +1731,27 @@
candidates (->> (partitioning/get-partition-names table-prefix)
(filter expired?)
sort)
drop-one (fn [table]
detach (fn detach [parent child]
(jdbc/do-commands-outside-txn
(format "alter table %s detach partition %s concurrently" parent child)))
drop-one (fn drop-one [table]
(update-lock-status status-key inc)
(try!
(if just-detach?
(jdbc/do-commands-outside-txn
(format "alter table %s detach partition %s concurrently" table-prefix table))
(let [ex (try
(detach table-prefix table)
(catch SQLException ex
(if (= (jdbc/sql-state :not-in-prerequisite-state) (.getSQLState ex))
ex
(throw ex))))]
(when (instance? SQLException ex)
(let [finalized-table (finalize-pending-detach table-prefix)]
(when-not (= finalized-table table)
;; Retry, unless the finalized partition detach was
;; for the same table
(detach table-prefix table)))))
(jdbc/do-commands
(format "drop table if exists %s cascade" table)))
(format "drop table if exists %s cascade" table)))
(finally
(update-lock-status status-key dec))))
drop #(if incremental?
Expand Down Expand Up @@ -1745,7 +1791,7 @@
"Drops the given set of tables. Will throw an SQLException termination if the
operation takes much longer than PDB_GC_DAILY_PARTITION_DROP_LOCK_TIMEOUT_MS."
[old-partition-tables update-lock-status status-key]
(let [drop #(doseq [table old-partition-tables]
(let [drop #(doseq [table (distinct old-partition-tables)]
(try
(update-lock-status status-key inc)
(jdbc/do-commands
Expand All @@ -1769,9 +1815,10 @@
;; PG14+
(let [detached-tables
(detach-daily-partitions "resource_events" date incremental?
update-lock-status :write-locking-resource-events)]
update-lock-status :write-locking-resource-events)
stranded-tables (find-stranded-partitions "^resource_events_\\d\\d\\d\\d\\d\\d\\d\\dz$")]
(jdbc/with-db-transaction []
(drop-partition-tables! detached-tables
(drop-partition-tables! (concat detached-tables stranded-tables)
update-lock-status :write-locking-resource-events)))))

(defn cleanup-dropped-report-certnames
Expand Down Expand Up @@ -1824,13 +1871,15 @@
;; PG14+
;; Detach partition concurrently must take place outside of a transaction.
(let [detached-resource-event-tables
(detach-daily-partitions "resource_events" effective-resource-events-ttl
incremental? update-lock-status
:write-locking-resource-events)
(detach-daily-partitions "resource_events" effective-resource-events-ttl
incremental? update-lock-status
:write-locking-resource-events)
stranded-events-tables (find-stranded-partitions "^resource_events_\\d\\d\\d\\d\\d\\d\\d\\dz$")
detached-report-tables
(detach-daily-partitions "reports" report-ttl
incremental? update-lock-status
:write-locking-reports)]
(detach-daily-partitions "reports" report-ttl
incremental? update-lock-status
:write-locking-reports)
stranded-reports-tables (find-stranded-partitions "^reports_\\d\\d\\d\\d\\d\\d\\d\\dz$")]
;; Now we can delete the partitions with less intrusive locking.
(jdbc/with-db-transaction []
;; Nothing should acquire locks on the detached tables, but to be safe, acquire
Expand All @@ -1840,9 +1889,9 @@
;; force a resource-events GC. prior to partitioning, this would have happened
;; via a cascade when the report was deleted, but now we just drop whole tables
;; of resource events.
(drop-partition-tables! detached-resource-event-tables
(drop-partition-tables! (concat detached-resource-event-tables stranded-events-tables)
update-lock-status :write-locking-resource-events)
(drop-partition-tables! detached-report-tables
(drop-partition-tables! (concat detached-report-tables stranded-reports-tables)
update-lock-status :write-locking-reports)
;; since we cannot cascade back to the certnames table anymore, go clean up
;; the latest_report_id column after a GC.
Expand Down
76 changes: 73 additions & 3 deletions test/puppetlabs/puppetdb/cli/services_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
[puppetlabs.puppetdb.command.constants :as cmd-consts]
[puppetlabs.puppetdb.lint :refer [ignore-value]]
[puppetlabs.puppetdb.scf.partitioning
:refer [get-temporal-partitions]]
:refer [get-temporal-partitions]
:as part]
[puppetlabs.trapperkeeper.testutils.logging
:refer [with-log-output
logs-matching
Expand Down Expand Up @@ -583,7 +584,7 @@
(with-test-db
(let [config (-> (create-temp-config)
(assoc :database *db* :read-database *read-db*)
(assoc-in [:database :gc-interval] "0.01"))
(assoc-in [:database :gc-interval] "60"))
store-report #(sync-command-post (svc-utils/pdb-cmd-url)
example-certname
"store report"
Expand All @@ -607,4 +608,73 @@
:report-ttl (time/parse-period "1d")
:resource-events-ttl (time/parse-period "1d")
:db-lock-status db-lock-status})
(is (empty? (jdbc/query ["SELECT * FROM reports"])))))))))
(is (empty? (jdbc/query ["SELECT * FROM reports"]))))

;; These tests are not applicable unless our Postgres version is new enough
;; to support the concurrent partition detach feature.
(when (scf-store/detach-partitions-concurrently?)
(testing "a partition stuck in the pending state is finalized and removed"
(let [old-ts (-> 2 time/days time/ago)
partition-table (format "reports_%s"
(part/date-suffix (part/to-zoned-date-time (time/to-timestamp old-ts))))
lock-acquired (promise)
partition-pending-detach (promise)]
(store-report (time/to-string old-ts))
(store-report (to-string (now)))

(future
;; Create a query that will block the ACCESS EXCLUSIVE lock needed
;; by the second transaction of the concurrent detach below
(jdbc/with-transacted-connection *read-db*
(jdbc/with-db-transaction []
(jdbc/query [(format "select * from %s" partition-table)])
(deliver lock-acquired partition-table)

;; wait for partition detach to fail
@partition-pending-detach)))

;; Wait until we are sure that the detach partition operation will be blocked
@lock-acquired

(try
(jdbc/do-commands-outside-txn
"SET statement_timeout = 100"
(format "ALTER TABLE reports DETACH PARTITION %s CONCURRENTLY" partition-table))
(catch java.sql.SQLException _)
(finally
(deliver partition-pending-detach partition-table)
(jdbc/do-commands-outside-txn "SET statement_timeout = 0")))

(is (= [{:inhdetachpending true}]
(jdbc/query ["select inhdetachpending from pg_catalog.pg_inherits where inhparent = 'reports'::regclass and inhrelid = ?::regclass" partition-table])))

(svcs/sweep-reports! *db* {:incremental? false
:report-ttl (time/parse-period "1d")
:resource-events-ttl (time/parse-period "1d")
:db-lock-status db-lock-status})

(is (empty?
(jdbc/query ["SELECT tablename FROM pg_tables WHERE tablename = ?" partition-table])))

(jdbc/do-commands "DELETE FROM reports")))

(testing "a detached partition that was not removed is cleaned up by gc"
(let [old-ts (-> 2 time/days time/ago)
partition-table (format "reports_%s"
(part/date-suffix (part/to-zoned-date-time (time/to-timestamp old-ts))))]
(store-report (time/to-string old-ts))
(store-report (to-string (now)))

;; Strand the partition before calling GC
(jdbc/do-commands-outside-txn
(format "ALTER TABLE reports DETACH PARTITION %s CONCURRENTLY" partition-table))

(svcs/sweep-reports! *db* {:incremental? false
:report-ttl (time/parse-period "1d")
:resource-events-ttl (time/parse-period "1d")
:db-lock-status db-lock-status})

(is (empty?
(jdbc/query ["SELECT tablename FROM pg_tables WHERE tablename = ?" partition-table])))

(jdbc/do-commands "DELETE FROM reports")))))))))

0 comments on commit 18fa7cc

Please sign in to comment.