Workaround for pg_dump non-idempotent behaviour

Problem
=======
* `pg_dump` may dump the data rows in arbitrary order. This changes the final hash of the dump file even though the restored data would be the same, which creates additional uploads with no value.

Solution
========
* Utilize the `pgdump-sort` script by `tigra564`, which attempts to sort the data in the dump file specifically for hashing purposes.
2022-12-26 12:35:38 -08:00
parent fddd8e84dd
commit 9702287a92
3 changed files with 287 additions and 7 deletions
--- a/README.md
+++ b/README.md
@@ -4,10 +4,13 @@
 * systemd
 * [AWS client](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html)
 * python3
 	* docopt
 * [pgdump-sort](https://github.com/tigra564/pgdump-sort) (Bundled)
 ### Install
 * Run `aws configure`
 * Symlink unit files to `/etc/systemd/system`
 * Run `systemctl enable` on units and slices
-* Run `systemctl start` on timers
+* Run `systemctl start` on timers 
--- a/backup-bitwarden.sh
+++ b/backup-bitwarden.sh
@@ -2,15 +2,20 @@
 set -euo pipefail
 scriptpath="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
 dumpfile="dump.sql"
 tmpdir="$(mktemp -d -p "${PWD}")"
 function check_for_hash() {
    local ret=0
-    echo "Checking if hash ${1} is present"
+    echo -n "Checking if hash ${1} is present: "
    aws s3 ls "s3://${S3_BUCKET}/sums/${1}" || ret=$?
-    echo "Returned: ${ret}"
+		case "$ret" in
 				0) echo "Yes." ;;
 				*) echo "No." ;;
 		esac
    return $ret
 }
@@ -18,6 +23,7 @@ function create_and_upload() {
    local sum=$1
    local backup_file
    backup_file="$(date +%Y/%m/backup-%d-%H-%M-%S.tar.gz)"
 		echo "Uploading ${backup_file}"
    tar -zc . | aws s3 cp - "s3://${S3_BUCKET}/${backup_file}"
    aws s3api put-object --bucket "${S3_BUCKET}" --key "sums/${sum}"
 }
@@ -29,16 +35,17 @@ pushd "${tmpdir}"
 rm -rf "${dumpfile}"
 touch "${dumpfile}"
 chmod ugo+w "${dumpfile}"
-sudo -u postgres -- pg_dump --no-owner --no-privileges --clean --if-exists --quote-all-identifiers "${DATABASE_URL}" -F plain -f "${dumpfile}"
+sudo -u postgres -- pg_dump --no-owner --no-privileges --clean --if-exists --quote-all-identifiers "${DATABASE_URL}" -f "${dumpfile}"
 "${scriptpath}/pgdump-sort" "${dumpfile}" "sorted.sql"
 cp -r "${ROOT_DIR}/data" "./data"
 cp "${ROOT_DIR}/.env" "./.env"
 cp "${ROOT_DIR}/bitwarden.exceede.com.conf" "./bitwarden.exceede.com.conf"
-# remove icon_cache
+sum=$(find . -type f -not -name "${dumpfile}" -and -not -path "./data/icon_cache/*" -exec md5sum {} + | LC_ALL=C sort | md5sum | cut -d ' ' -f 1)
 rm -rf ./data/icon_cache
-sum=$(find . -type f -exec md5sum {} + | LC_ALL=C sort | md5sum | cut -d ' ' -f 1)
+rm sorted.sql
 check_for_hash "$sum" || create_and_upload "${sum}"
--- a/270
+++ b/270
@@ -0,0 +1,270 @@
 #!/usr/bin/python3
 """
 Usage: pgdump-sort [options] <dump> [<sorted-dump>]
       pgdump-sort -h | --help | --version
 Options:
  -n          Sort entries in natural order (requires python3 module natsort)
  -h --help   Show this usage and exit
  --version   Show version and exit
 """
 from docopt import docopt
 import os
 import sys
 import re
 import tempfile
 import shutil
 from enum import Enum
 # natsort is optional; _has_natsort records whether the import succeeded so
 # that the -n option can be rejected with a helpful message when it did not.
 _has_natsort = False
 try:
 	import natsort
 	_has_natsort = True
 except ModuleNotFoundError:
 	pass
 version='0.2'
 # Matches an object header comment of the dump, for example
 #   -- Data for Name: users; Type: TABLE DATA; Schema: public; Owner: postgres
 # The 'isdata' group is non-empty only for "Data for Name:" headers.
 RE_OBJDESC = re.compile(
 	r'-- (?P<isdata>(Data for )?)Name: (?P<name>.*?); '
 	r'Type: (?P<type>.*?); '
 	r'Schema: (?P<schema>.*?); '
 	r'Owner: (?P<owner>.*)'
 )
 # Matches a sequence-value statement and captures the sequence name.
 # Raw string, so that "\(" reaches the regex engine unchanged instead of
 # being an invalid string escape sequence.
 RE_SEQSET = re.compile(r"SELECT pg_catalog.setval\('(?P<name>.*?)'.*")
 # Parser states used while walking through the dump. The numeric values are
 # spelled out because they are also used as a prefix of section file names.
 state = Enum('state', [
 	('EMPTY', 1),
 	('SETTINGS', 2),
 	('DEF', 3),
 	('DATA', 4),
 	('COPY', 5),
 	('INSERT', 6),
 	('SEQSET', 7),
 ])
 class buffer(list):
 	"""Pending lines of the dump section currently being read.
 	The list content is the lines collected so far; `st`, `fname` and `title`
 	describe the section they belong to. `flushto` writes the section to a
 	file inside `destdir` and switches to the next section.
 	"""
 	destdir = None
 	st = state.EMPTY
 	fname = None
 	title = None
 	def __init__(self, destdir, fsorted, fsorted_args):
 		"""Create an empty buffer.
 		destdir      -- directory that receives one file per section
 		fsorted      -- sorting callable applied to COPY/INSERT data lines
 		fsorted_args -- dict of keyword arguments forwarded to `fsorted`
 		"""
 		super().__init__()
 		self.destdir = destdir
 		self.fsorted = fsorted
 		self.fsorted_args = fsorted_args
 	def flushto(self, st, fname, title):
 		"""Write the pending section to its file and start a new section.
 		st, fname, title -- state, file name and title line of the NEW section;
 		the pending lines are written using the values stored by the previous
 		call. Nothing is written when no lines remain after trimming.
 		"""
 		# Trim enclosing comments and empty lines
 		while self and ('' == self[0] or self[0].startswith('--')):
 			del self[0]
 		while self and ('' == self[-1] or self[-1].startswith('--')):
 			del self[-1]
 		if self:
 			if self.st in (state.COPY, state.INSERT):
 				self[:] = sort_datalines(self, self.fsorted, self.fsorted_args)
 			# Every section file starts with its title wrapped in '--' lines
 			self[:] = ['--', self.title, '--', ''] + self
 			with open(os.path.join(self.destdir, self.fname), "w") as out:
 				out.writelines(l + '\n' for l in self)
 		self.clear()
 		self.st = st
 		self.fname = fname
 		self.title = title
 	def proc_comment(self, line):
 		"""Handle `line` if it is a comment.
 		Returns True if the line is a comment (starts with '--'), i.e. it has
 		been processed, and False otherwise. A comment that is an object header
 		flushes the pending section and starts the one it describes; any other
 		comment is dropped.
 		"""
 		if not line.startswith('--'):
 			return False
 		m = RE_OBJDESC.match(line)
 		if not m:
 			return True
 		if 'SEQUENCE SET' == m.group('type'):
 			st = state.SEQSET
 		elif m.group('isdata'):
 			st = state.DATA
 		else:
 			st = state.DEF
 		fname = '%d-%s-%s-%s-%s' % (
 			st.value,
 			m.group('type'),
 			m.group('schema'),
 			m.group('name'),
 			m.group('owner')
 		)
 		# The header fields are free text and may contain a path separator,
 		# which would make os.path.join() point into a non-existent
 		# subdirectory. Encode separators so fname stays one path component.
 		for sep in (os.sep, os.altsep):
 			if sep:
 				fname = fname.replace(sep, '%%%02X' % ord(sep))
 		if 255 < len(fname):
 			fname = fname[:255-3] + "..."
 		self.flushto(st, fname, line)
 		return True
 def sort_datalines(lines, fsorted, fsorted_args):
 	"""Return `lines` of a COPY or INSERT section with the data rows sorted.
 	lines        -- section lines, with or without trailing newlines
 	fsorted      -- sorting callable, called as fsorted(rows, **fsorted_args)
 	fsorted_args -- dict of keyword arguments for `fsorted`
 	Lines up to and including the COPY header keep their position, and so do
 	the terminator line and everything after it. The terminator is the
 	end-of-data marker (backslash-period) for COPY and an empty line for
 	INSERT. Only the rows in between are sorted; for INSERT the first INSERT
 	line is itself a row.
 	"""
 	pre = []
 	data = []
 	post = []
 	target = pre
 	terminators = ()
 	for line in lines:
 		if target is pre:
 			if line.startswith('COPY'):
 				pre.append(line)
 				target = data
 				# Accept the marker with and without its newline: callers
 				# may or may not have stripped line endings.
 				terminators = ('\\.', '\\.\n')
 			elif line.startswith('INSERT'):
 				data.append(line)
 				target = data
 				terminators = ('', '\n')
 			else:
 				pre.append(line)
 		elif target is data and line in terminators:
 			# The terminator must stay behind the rows, so it starts `post`
 			target = post
 			post.append(line)
 		else:
 			target.append(line)
 	return pre + fsorted(data, **fsorted_args) + post
 def dissect(dump, destdir, fsorted, fsorted_args):
 	"""Split the dump file `dump` into one file per section inside `destdir`.
 	dump         -- path of the dump file to read
 	destdir      -- existing directory that receives the section files
 	fsorted      -- sorting callable used for COPY/INSERT data lines
 	fsorted_args -- dict of keyword arguments for `fsorted`
 	The file is read line by line through a small state machine; sections
 	are written out by the buffer whenever a new section starts.
 	"""
 	buf = buffer(destdir, fsorted, fsorted_args)
 	# Context manager, so the dump is closed even when parsing fails
 	with open(dump) as src:
 		for line in src:
 			# trim trailing newline (if any)
 			if line.endswith('\n'):
 				line = line[:-1]
 			if buf.st == state.EMPTY:
 				if buf.proc_comment(line):
 					pass
 				elif '' == line:
 					pass
 				else:
 					# First non-comment line outside any section: start SETTINGS
 					buf.flushto(state.SETTINGS, "%d-%s" % (state.SETTINGS.value, "SETTINGS"),
 						'-- Sorted PostgreSQL database dump')
 					buf.append(line)
 			elif buf.st in (state.SETTINGS, state.DEF, state.INSERT):
 				if buf.proc_comment(line):
 					pass
 				else:
 					buf.append(line)
 			elif buf.st == state.DATA:
 				# The first COPY/INSERT line decides how the data ends
 				if line.startswith('COPY '):
 					buf.st = state.COPY
 				elif line.startswith('INSERT '):
 					buf.st = state.INSERT
 				buf.append(line)
 			elif buf.st == state.COPY:
 				buf.append(line)
 				# COPY data ends at the backslash-period marker line
 				if r'\.' == line:
 					buf.flushto(state.EMPTY, None, None)
 			elif buf.st == state.SEQSET:
 				if buf.proc_comment(line):
 					pass
 				elif line.startswith('SELECT pg_catalog.setval'):
 					m = RE_SEQSET.match(line)
 					if m:
 						# Replace the arguments after the sequence name with a
 						# fixed "1, false" (presumably so the current sequence
 						# position does not influence the sorted dump).
 						line = "SELECT pg_catalog.setval('%s', 1, false);" % m.group('name')
 					# A statement the pattern cannot parse is kept unchanged
 					buf.append(line)
 				else:
 					buf.append(line)
 			else:
 				print("This should not happen")
 	buf.flushto(state.EMPTY, None, None)
 def recombine(destdir, dump, fsorted, fsorted_args):
 	"""Concatenate the files in `destdir` into the output file `dump`.
 	destdir      -- directory holding the section files
 	dump         -- path of the file to write (overwritten)
 	fsorted      -- sorting callable that decides the order of the files
 	fsorted_args -- dict of keyword arguments for `fsorted`
 	Files are separated by one empty line. A closing comment block is
 	appended unless `destdir` contained no files.
 	"""
 	# Context manager, so the output is closed even if a section file
 	# cannot be read
 	with open(dump, 'w') as out:
 		sorted_files = fsorted(os.listdir(destdir), **fsorted_args)
 		for idx, fname in enumerate(sorted_files):
 			if idx:
 				out.write('\n')
 			with open(os.path.join(destdir, fname)) as f:
 				out.writelines(f)
 		if sorted_files:
 			out.writelines([
 				'\n',
 				'--\n',
 				'-- Sorted dump complete\n',
 				'--\n',
 			])
 def pgdump_sort(dump, sdump, fsorted=sorted, **fsorted_args):
 	"""Write a sorted version of the dump file `dump` to `sdump`.
 	fsorted is the sorting callable to use; any extra keyword arguments are
 	forwarded to it. The dump is split into files inside a temporary
 	directory, which is then recombined into `sdump`.
 	"""
 	workdir = tempfile.mkdtemp(
 		prefix='pgdump-sort',
 		suffix=os.path.basename(dump),
 	)
 	try:
 		dissect(dump, workdir, fsorted, fsorted_args)
 		recombine(workdir, sdump, fsorted, fsorted_args)
 	finally:
 		# The temporary directory is removed whether or not sorting succeeded
 		shutil.rmtree(workdir)
 # Message printed when -n is requested but natsort could not be imported
 natsort_error = \
 """In order to use natural sort you need to install natsort module:
   pip install natsort
 """
 if __name__ == '__main__':
 	args = docopt(__doc__, version=version)
 	dump = args['<dump>']
 	sdump = args['<sorted-dump>']
 	if sdump is None:
 		# Default output name: the dump name without ".sql" plus "-sorted.sql"
 		sdump = re.sub(r'\.sql$', '', dump) + '-sorted.sql'
 	if args['-n']:
 		if not _has_natsort:
 			print(natsort_error, file=sys.stderr)
 			# sys.exit() rather than the bare exit(), which is only injected
 			# by the site module and is not guaranteed to exist
 			sys.exit(1)
 		fsorted = natsort.natsorted
 		fsorted_args = {'alg': natsort.ns.IGNORECASE}
 	else:
 		fsorted = sorted
 		fsorted_args = {}
 	pgdump_sort(dump, sdump, fsorted, **fsorted_args)