Workaround for pg_dump non-idempotent behaviour

Problem
=======

* `pg_dump` may dump the data rows in arbitrary order. This messes with
  the final hash of the dump file, even though the data after restoring
  would be the same. This creates additional uploads with no value.

Solution
========

* Utilize `pgdump-sort` script by `tigra564`, which attempts to sort
  data in the dump file specifically for hashing purposes.
This commit is contained in:
2022-12-26 12:35:38 -08:00
parent fddd8e84dd
commit 9702287a92
3 changed files with 287 additions and 7 deletions

View File

@@ -4,6 +4,9 @@
* systemd
* [AWS client](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html)
* python3
* docopt
* [pgdump-sort](https://github.com/tigra564/pgdump-sort) (Bundled)
### Install

View File

@@ -2,15 +2,20 @@
set -euo pipefail
scriptpath="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
dumpfile="dump.sql"
tmpdir="$(mktemp -d -p "${PWD}")"
function check_for_hash() {
local ret=0
echo "Checking if hash ${1} is present"
echo -n "Checking if hash ${1} is present: "
aws s3 ls "s3://${S3_BUCKET}/sums/${1}" || ret=$?
echo "Returned: ${ret}"
case "$ret" in
0) echo "Yes." ;;
*) echo "No." ;;
esac
return $ret
}
@@ -18,6 +23,7 @@ function create_and_upload() {
local sum=$1
local backup_file
backup_file="$(date +%Y/%m/backup-%d-%H-%M-%S.tar.gz)"
echo "Uploading ${backup_file}"
tar -zc . | aws s3 cp - "s3://${S3_BUCKET}/${backup_file}"
aws s3api put-object --bucket "${S3_BUCKET}" --key "sums/${sum}"
}
@@ -29,16 +35,17 @@ pushd "${tmpdir}"
rm -rf "${dumpfile}"
touch "${dumpfile}"
chmod ugo+w "${dumpfile}"
sudo -u postgres -- pg_dump --no-owner --no-privileges --clean --if-exists --quote-all-identifiers "${DATABASE_URL}" -F plain -f "${dumpfile}"
sudo -u postgres -- pg_dump --no-owner --no-privileges --clean --if-exists --quote-all-identifiers "${DATABASE_URL}" -f "${dumpfile}"
"${scriptpath}/pgdump-sort" "${dumpfile}" "sorted.sql"
cp -r "${ROOT_DIR}/data" "./data"
cp "${ROOT_DIR}/.env" "./.env"
cp "${ROOT_DIR}/bitwarden.exceede.com.conf" "./bitwarden.exceede.com.conf"
# remove icon_cache
rm -rf ./data/icon_cache
sum=$(find . -type f -not -name "${dumpfile}" -and -not -path "./data/icon_cache/*" -exec md5sum {} + | LC_ALL=C sort | md5sum | cut -d ' ' -f 1)
sum=$(find . -type f -exec md5sum {} + | LC_ALL=C sort | md5sum | cut -d ' ' -f 1)
rm sorted.sql
check_for_hash "$sum" || create_and_upload "${sum}"

270
pgdump-sort Executable file
View File

@@ -0,0 +1,270 @@
#!/usr/bin/python3
"""
Usage: pgdump-sort [options] <dump> [<sorted-dump>]
pgdump-sort -h | --help | --version
Options:
-n Sort entries in natural order (requires python3 module natsort)
-h --help Show this usage and exit
--version Show version and exit
"""
from docopt import docopt
import os
import sys
import re
import tempfile
import shutil
from enum import Enum
# Optional natural-sort support: natsort is a third-party module, so probe
# for it at import time and fall back to plain sorted() when it is missing.
_has_natsort = False
try:
    import natsort
    _has_natsort = True
except ModuleNotFoundError:
    pass

version='0.2'

# Matches pg_dump object-header comments such as
#   -- Name: foo; Type: TABLE; Schema: public; Owner: bob
#   -- Data for Name: foo; Type: TABLE DATA; ...
# `isdata` is non-empty for "Data for" (table-data) sections.
RE_OBJDESC = re.compile(
    '-- (?P<isdata>(Data for )?)Name: (?P<name>.*?); '
    'Type: (?P<type>.*?); '
    'Schema: (?P<schema>.*?); '
    'Owner: (?P<owner>.*)'
)
# Captures the sequence name from "SELECT pg_catalog.setval('name', ...)".
RE_SEQSET = re.compile("SELECT pg_catalog.setval\('(?P<name>.*?)'.*")
class state(Enum):
    """Parser states: the kind of dump section currently being buffered."""
    EMPTY = 1     # between sections
    SETTINGS = 2  # dump preamble (SET statements, comments)
    DEF = 3       # object definition (DDL)
    DATA = 4      # data section announced; COPY/INSERT not yet seen
    COPY = 5      # inside a COPY ... FROM stdin block
    INSERT = 6    # inside a run of INSERT statements
    SEQSET = 7    # sequence-set section (pg_catalog.setval)
class buffer(list):
    """Accumulates the lines of one dump section and flushes them to a
    per-section file under ``destdir`` so the sections can later be
    recombined in a deterministic order."""

    # Class-level defaults; instances overwrite these as parsing proceeds.
    destdir = None    # directory that receives one file per section
    st = state.EMPTY  # kind of the section currently buffered (see `state`)
    fname = None      # file name the current section will be written to
    title = None      # the '-- ... Name: ...' comment introducing the section

    def __init__(self, destdir, fsorted, fsorted_args):
        # fsorted / fsorted_args: the sort callable (sorted or natsorted)
        # and its keyword arguments, used to order data rows and files.
        self.destdir = destdir
        self.fsorted = fsorted
        self.fsorted_args = fsorted_args

    def flushto(self, st, fname, title):
        """Write the buffered section (if non-empty) to ``self.fname``,
        then reset the buffer for the next section described by
        ``st``/``fname``/``title``."""
        #print("EVICTING", self.st, "to", self.fname, "New state:", st)
        # Trim ellipsing comments and empty lines
        while self and ('' == self[0] or self[0].startswith('--')):
            del self[0]
        while self and ('' == self[-1] or self[-1].startswith('--')):
            del self[-1]
        if len(self):
            if self.st in (state.COPY, state.INSERT):
                # Data sections get their rows sorted for determinism.
                self[:] = sort_datalines(self, self.fsorted, self.fsorted_args)
            # Re-add a uniform header comment for the section.
            self[:] = [
                '--',
                self.title,
                '--',
                '',
            ] + self
            with open(os.path.join(self.destdir, self.fname), "w") as out:
                out.writelines([l + '\n' for l in self])
        self.clear()
        self.st = st
        self.fname = fname
        self.title = title

    def proc_comment(self, line):
        # Returns True if the line is a comment, i.e. it has been processed
        if not line.startswith('--'):
            return False
        m = re.match(RE_OBJDESC, line)
        if not m:
            # An ordinary comment: swallow it (flushto re-adds headers).
            return True
        # A new object header: decide what kind of section follows it.
        if 'SEQUENCE SET' == m.group('type'):
            st = state.SEQSET
        elif m.group('isdata'):
            st = state.DATA
        else:
            st = state.DEF
        # The section file name encodes kind/type/schema/name/owner so a
        # plain sort recombines sections in a stable, sensible order.
        fname = '%d-%s-%s-%s-%s' % (
            st.value,
            m.group('type'),
            m.group('schema'),
            m.group('name'),
            m.group('owner')
        )
        if 255 < len(fname):
            # Stay within common filesystem name-length limits.
            fname = fname[:255-3] + "..."
        self.flushto(st, fname, line)
        return True
def sort_datalines(lines, fsorted, fsorted_args):
    """Sort the data rows of a COPY/INSERT section for determinism.

    ``lines`` are dump lines with trailing newlines already stripped
    (see dissect()).  Rows between the ``COPY ...`` header (or from the
    first ``INSERT``) and the terminator are ordered with ``fsorted``;
    everything before and after passes through untouched.

    Fixes over the original:
    * the terminator tests compared against '\\n'-suffixed strings even
      though callers strip newlines, so the COPY terminator '\\.' was
      swallowed into the sorted data (mis-ordering rows that start with
      characters above '\\' in the collation);
    * the post-terminator transition assigned ``status = 2`` instead of
      the state variable, so phase 2 was never reached.
    """
    pre = []
    data = []
    post = []
    phase = 0    # 0: before data rows, 1: inside data rows, 2: after terminator
    ptr = pre
    isins = False
    for line in lines:
        if 0 == phase:
            if line.startswith('COPY'):
                # The COPY header stays in `pre`; the rows follow it.
                ptr.append(line)
                ptr = data
                phase = 1
            elif line.startswith('INSERT'):
                # INSERT statements themselves are the data rows.
                ptr = data
                ptr.append(line)
                isins = True
                phase = 1
            else:
                ptr.append(line)
        elif 1 == phase:
            # Terminators (newline-stripped): blank line for INSERT runs,
            # the literal '\.' marker for COPY blocks.
            if isins and '' == line or not isins and '\\.' == line:
                ptr = post
                ptr.append(line)
                phase = 2
            else:
                ptr.append(line)
        else:
            ptr.append(line)
    return pre + fsorted(data, **fsorted_args) + post
def dissect(dump, destdir, fsorted, fsorted_args):
    """Split the plain-format dump into one file per section in *destdir*.

    Drives the `buffer` state machine line by line: object-header
    comments start new sections, COPY/INSERT data is collected (and
    sorted by the buffer on flush), and pg_catalog.setval() calls are
    rewritten to a constant value so sequence positions do not perturb
    the hash.

    Fix: the dump file is now opened via a context manager so the
    handle is closed deterministically (the original left it to GC).
    """
    buf = buffer(destdir, fsorted, fsorted_args)
    with open(dump) as src:
        for line in src:
            # trim trailing newline (if any)
            if line.endswith('\n'):
                line = line[:-1]
            #print(buf.st.name.ljust(10), "\t[%s]" % line)
            if buf.st == state.EMPTY:
                if buf.proc_comment(line):
                    pass
                elif '' == line:
                    pass
                else:
                    # First non-comment content: the dump's SETTINGS preamble.
                    buf.flushto(state.SETTINGS,
                                "%d-%s" % (state.SETTINGS.value, "SETTINGS"),
                                '-- Sorted PostgreSQL database dump')
                    buf.append(line)
            elif buf.st in (state.SETTINGS, state.DEF, state.INSERT):
                if buf.proc_comment(line):
                    pass
                else:
                    buf.append(line)
            elif buf.st == state.DATA:
                # Waiting to learn whether data arrives as COPY or INSERT.
                if line.startswith('COPY '):
                    buf.st = state.COPY
                elif line.startswith('INSERT '):
                    buf.st = state.INSERT
                buf.append(line)
            elif buf.st == state.COPY:
                buf.append(line)
                if r'\.' == line:
                    # End-of-COPY marker: this section is complete.
                    buf.flushto(state.EMPTY, None, None)
            elif buf.st == state.SEQSET:
                if buf.proc_comment(line):
                    pass
                elif line.startswith('SELECT pg_catalog.setval'):
                    # Pin sequence values so they don't change the hash.
                    m = re.match(RE_SEQSET, line)
                    line = "SELECT pg_catalog.setval('%s', 1, false);" % m.group('name')
                    buf.append(line)
                else:
                    buf.append(line)
            else:
                print("This should not happen")
    # Flush whatever section is still buffered at end of file.
    buf.flushto(state.EMPTY, None, None)
def recombine(destdir, dump, fsorted, fsorted_args):
    """Concatenate the section files in *destdir* (in ``fsorted`` order)
    into the single output file *dump*, separating sections with blank
    lines and appending a closing banner when anything was written.

    Fix: the output file is opened via a context manager so it is
    closed (and flushed) even if an I/O error occurs mid-copy; the
    original never closed ``out`` on the error path.
    """
    with open(dump, 'w') as out:
        sorted_files = fsorted(os.listdir(destdir), **fsorted_args)
        for idx, fname in enumerate(sorted_files):
            if idx:
                # Blank line between consecutive sections.
                out.write('\n')
            with open(os.path.join(destdir, fname)) as f:
                shutil.copyfileobj(f, out)
        if sorted_files:
            out.writelines([
                '\n',
                '--\n',
                '-- Sorted dump complete\n',
                '--\n',
            ])
def pgdump_sort(dump, sdump, fsorted=sorted, **fsorted_args):
    """Write a deterministically ordered copy of *dump* to *sdump*.

    The dump is dissected into per-section files inside a throwaway
    working directory, then recombined in ``fsorted`` order.  The
    working directory is always removed, even when a step fails.
    """
    workdir = tempfile.mkdtemp(suffix=os.path.basename(dump),
                               prefix='pgdump-sort')
    try:
        dissect(dump, workdir, fsorted, fsorted_args)
        recombine(workdir, sdump, fsorted, fsorted_args)
    finally:
        shutil.rmtree(workdir)
# Shown on stderr when -n is requested but natsort is not installed.
natsort_error = \
"""In order to use natural sort you need to install natsort module:
pip install natsort
"""


if __name__ == '__main__':
    args = docopt(__doc__, version=version)
    dump = args['<dump>']
    sdump = args['<sorted-dump>']
    if sdump is None:
        # Default output name: foo.sql -> foo-sorted.sql
        sdump = re.sub(r'\.sql$', '', dump) + '-sorted.sql'
    # Plain lexicographic sort unless natural order was requested.
    fsorted, fsorted_args = sorted, {}
    if args['-n']:
        if not _has_natsort:
            print(natsort_error, file=sys.stderr)
            exit(1)
        fsorted = natsort.natsorted
        fsorted_args = {'alg': natsort.ns.IGNORECASE}
    pgdump_sort(dump, sdump, fsorted, **fsorted_args)