Workaround for pg_dump non-idempotent behaviour
Problem ======= * `pg_dump` may dump the data rows in arbitrary order. This changes the final hash of the dump file, even though the data after restoring would be the same. This creates additional uploads with no value. Solution ======== * Utilize the `pgdump-sort` script by `tigra564`, which attempts to sort data in the dump file specifically for hashing purposes.
This commit is contained in:
@@ -4,10 +4,13 @@
|
|||||||
|
|
||||||
* systemd
|
* systemd
|
||||||
* [AWS client](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html)
|
* [AWS client](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html)
|
||||||
|
* python3
|
||||||
|
* docopt
|
||||||
|
* [pgdump-sort](https://github.com/tigra564/pgdump-sort) (Bundled)
|
||||||
|
|
||||||
### Install
|
### Install
|
||||||
|
|
||||||
* Run `aws configure`
|
* Run `aws configure`
|
||||||
* Symlink unit files to `/etc/systemd/system`
|
* Symlink unit files to `/etc/systemd/system`
|
||||||
* Run `systemctl enable` on units and slices
|
* Run `systemctl enable` on units and slices
|
||||||
* Run `systemctl start` on timers
|
* Run `systemctl start` on timers
|
||||||
|
|||||||
@@ -2,15 +2,20 @@
|
|||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
|
scriptpath="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
|
||||||
|
|
||||||
dumpfile="dump.sql"
|
dumpfile="dump.sql"
|
||||||
|
|
||||||
tmpdir="$(mktemp -d -p "${PWD}")"
|
tmpdir="$(mktemp -d -p "${PWD}")"
|
||||||
|
|
||||||
function check_for_hash() {
|
function check_for_hash() {
|
||||||
local ret=0
|
local ret=0
|
||||||
echo "Checking if hash ${1} is present"
|
echo -n "Checking if hash ${1} is present: "
|
||||||
aws s3 ls "s3://${S3_BUCKET}/sums/${1}" || ret=$?
|
aws s3 ls "s3://${S3_BUCKET}/sums/${1}" || ret=$?
|
||||||
echo "Returned: ${ret}"
|
case "$ret" in
|
||||||
|
0) echo "Yes." ;;
|
||||||
|
*) echo "No." ;;
|
||||||
|
esac
|
||||||
return $ret
|
return $ret
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -18,6 +23,7 @@ function create_and_upload() {
|
|||||||
local sum=$1
|
local sum=$1
|
||||||
local backup_file
|
local backup_file
|
||||||
backup_file="$(date +%Y/%m/backup-%d-%H-%M-%S.tar.gz)"
|
backup_file="$(date +%Y/%m/backup-%d-%H-%M-%S.tar.gz)"
|
||||||
|
echo "Uploading ${backup_file}"
|
||||||
tar -zc . | aws s3 cp - "s3://${S3_BUCKET}/${backup_file}"
|
tar -zc . | aws s3 cp - "s3://${S3_BUCKET}/${backup_file}"
|
||||||
aws s3api put-object --bucket "${S3_BUCKET}" --key "sums/${sum}"
|
aws s3api put-object --bucket "${S3_BUCKET}" --key "sums/${sum}"
|
||||||
}
|
}
|
||||||
@@ -29,16 +35,17 @@ pushd "${tmpdir}"
|
|||||||
rm -rf "${dumpfile}"
|
rm -rf "${dumpfile}"
|
||||||
touch "${dumpfile}"
|
touch "${dumpfile}"
|
||||||
chmod ugo+w "${dumpfile}"
|
chmod ugo+w "${dumpfile}"
|
||||||
sudo -u postgres -- pg_dump --no-owner --no-privileges --clean --if-exists --quote-all-identifiers "${DATABASE_URL}" -F plain -f "${dumpfile}"
|
sudo -u postgres -- pg_dump --no-owner --no-privileges --clean --if-exists --quote-all-identifiers "${DATABASE_URL}" -f "${dumpfile}"
|
||||||
|
|
||||||
|
"${scriptpath}/pgdump-sort" "${dumpfile}" "sorted.sql"
|
||||||
|
|
||||||
cp -r "${ROOT_DIR}/data" "./data"
|
cp -r "${ROOT_DIR}/data" "./data"
|
||||||
cp "${ROOT_DIR}/.env" "./.env"
|
cp "${ROOT_DIR}/.env" "./.env"
|
||||||
cp "${ROOT_DIR}/bitwarden.exceede.com.conf" "./bitwarden.exceede.com.conf"
|
cp "${ROOT_DIR}/bitwarden.exceede.com.conf" "./bitwarden.exceede.com.conf"
|
||||||
|
|
||||||
# remove icon_cache
|
sum=$(find . -type f -not -name "${dumpfile}" -and -not -path "./data/icon_cache/*" -exec md5sum {} + | LC_ALL=C sort | md5sum | cut -d ' ' -f 1)
|
||||||
rm -rf ./data/icon_cache
|
|
||||||
|
|
||||||
sum=$(find . -type f -exec md5sum {} + | LC_ALL=C sort | md5sum | cut -d ' ' -f 1)
|
rm sorted.sql
|
||||||
|
|
||||||
check_for_hash "$sum" || create_and_upload "${sum}"
|
check_for_hash "$sum" || create_and_upload "${sum}"
|
||||||
|
|
||||||
|
|||||||
270
pgdump-sort
Executable file
270
pgdump-sort
Executable file
@@ -0,0 +1,270 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
|
||||||
|
"""
|
||||||
|
Usage: pgdump-sort [options] <dump> [<sorted-dump>]
|
||||||
|
pgdump-sort -h | --help | --version
|
||||||
|
|
||||||
|
Options:
|
||||||
|
-n Sort entries in natural order (requires python3 module natsort)
|
||||||
|
|
||||||
|
-h --help Show this usage and exit
|
||||||
|
--version Show version and exit
|
||||||
|
"""
|
||||||
|
|
||||||
|
from docopt import docopt
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import re
|
||||||
|
import tempfile
|
||||||
|
import shutil
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
_has_natsort = False
|
||||||
|
try:
|
||||||
|
import natsort
|
||||||
|
_has_natsort = True
|
||||||
|
except ModuleNotFoundError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
version='0.2'
|
||||||
|
|
||||||
|
|
||||||
|
# Matches the comment header that introduces an object in the dump, e.g.
#   -- Name: users; Type: TABLE; Schema: public; Owner: -
#   -- Data for Name: users; Type: TABLE DATA; Schema: public; Owner: -
# The `isdata` group is non-empty only for the "Data for" variant.
RE_OBJDESC = re.compile(
    r'-- (?P<isdata>(Data for )?)Name: (?P<name>.*?); '
    r'Type: (?P<type>.*?); '
    r'Schema: (?P<schema>.*?); '
    r'Owner: (?P<owner>.*)'
)

# Captures the quoted sequence name of a `SELECT pg_catalog.setval('...', ...)`
# statement.  Written as a raw string so that `\(` reaches the regex engine
# without relying on Python passing an invalid string escape through.
RE_SEQSET = re.compile(r"SELECT pg_catalog.setval\('(?P<name>.*?)'.*")
|
||||||
|
|
||||||
|
|
||||||
|
class state(Enum):
    """Parser states used while walking through a plain-text dump.

    The numeric values are significant: they are used as the leading
    component of the per-section file names, so they take part in the
    ordering of the recombined output.
    """

    # Nothing collected yet / between sections.
    EMPTY = 1
    # Leading statements that appear before any object header.
    SETTINGS = 2
    # Section introduced by a plain "Name: ..." header.
    DEF = 3
    # Section introduced by a "Data for Name: ..." header.
    DATA = 4
    # Inside a data section whose statement starts with "COPY ".
    COPY = 5
    # Inside a data section whose statement starts with "INSERT ".
    INSERT = 6
    # Section whose header type is "SEQUENCE SET".
    SEQSET = 7
|
||||||
|
|
||||||
|
|
||||||
|
class buffer(list):
    """Accumulates the lines of the dump section currently being parsed.

    The list content is the section body.  ``st``, ``fname`` and ``title``
    describe that section; ``flushto`` writes it to its own file inside
    ``destdir`` and switches to the next section.
    """

    destdir = None      # directory that receives one file per section
    st = state.EMPTY    # parser state of the section being collected
    fname = None        # file name (inside destdir) of the current section
    title = None        # header comment written at the top of the section

    def __init__(self, destdir, fsorted, fsorted_args):
        """Create an empty buffer.

        destdir      -- directory the section files are written to
        fsorted      -- callable used to sort data lines (``sorted``-like)
        fsorted_args -- dict of keyword arguments passed to ``fsorted``
        """
        super().__init__()
        self.destdir = destdir
        self.fsorted = fsorted
        self.fsorted_args = fsorted_args

    def flushto(self, st, fname, title):
        """Write out the collected section, then start a new one.

        The current content (if any remains after trimming) is written to
        ``self.fname``; afterwards the buffer is emptied and ``st``,
        ``fname`` and ``title`` become the description of the next section.
        """
        # Trim surrounding comments and empty lines
        while self and ('' == self[0] or self[0].startswith('--')):
            del self[0]
        while self and ('' == self[-1] or self[-1].startswith('--')):
            del self[-1]

        if self:
            # Only data sections have their rows reordered.
            if self.st in (state.COPY, state.INSERT):
                self[:] = sort_datalines(self, self.fsorted, self.fsorted_args)

            self[:] = [
                '--',
                self.title,
                '--',
                '',
            ] + self

            with open(os.path.join(self.destdir, self.fname), "w") as out:
                out.writelines([l + '\n' for l in self])

        self.clear()
        self.st = st
        self.fname = fname
        self.title = title

    def proc_comment(self, line):
        """Handle ``line`` if it is a comment.

        Returns True if the line is a comment, i.e. it has been processed.
        A comment that is an object header additionally flushes the current
        section and opens the one the header describes.
        """
        if not line.startswith('--'):
            return False

        m = RE_OBJDESC.match(line)
        if not m:
            return True

        if 'SEQUENCE SET' == m.group('type'):
            st = state.SEQSET
        elif m.group('isdata'):
            st = state.DATA
        else:
            st = state.DEF

        fname = '%d-%s-%s-%s-%s' % (
            st.value,
            m.group('type'),
            m.group('schema'),
            m.group('name'),
            m.group('owner'),
        )

        # The header fields are copied verbatim from the dump and may contain
        # a path separator, which would make the later open() fail.  Escape
        # it so the name stays a single path component.
        for sep in (os.sep, os.altsep):
            if sep:
                fname = fname.replace(sep, '%%%02X' % ord(sep))

        # NOTE: the limit counts characters, not encoded bytes.
        if 255 < len(fname):
            fname = fname[:255-3] + "..."

        self.flushto(st, fname, line)

        return True
|
||||||
|
|
||||||
|
|
||||||
|
def sort_datalines(lines, fsorted, fsorted_args):
    """Sort the data rows of one COPY/INSERT section, keeping its framing.

    lines        -- lines of the section, with or without trailing newlines
    fsorted      -- ``sorted``-like callable applied to the data rows
    fsorted_args -- dict of keyword arguments passed to ``fsorted``

    Returns a new list: everything up to and including the ``COPY`` line
    (or everything before the first ``INSERT`` line), then the sorted rows,
    then the terminator and whatever follows it.
    """
    pre = []
    data = []
    post = []

    # 0: before the data rows, 1: inside them, 2: after the terminator.
    phase = 0
    isins = False
    for line in lines:
        if phase == 0:
            if line.startswith('COPY'):
                # The COPY statement itself is framing, not data.
                pre.append(line)
                phase = 1
            elif line.startswith('INSERT'):
                # Every INSERT statement is a data row.
                data.append(line)
                isins = True
                phase = 1
            else:
                pre.append(line)
        elif phase == 1:
            if isins:
                # Unchanged from the original check: only a bare newline line
                # ends an INSERT run, so newline-stripped input has all of
                # its remaining lines sorted.
                is_end = line == '\n'
            else:
                # The end-of-data marker must stay last; accept it with or
                # without the trailing newline so it is never sorted in
                # among the rows.
                is_end = line in ('\\.', '\\.\n')

            if is_end:
                post.append(line)
                phase = 2
            else:
                data.append(line)
        else:
            post.append(line)

    return pre + fsorted(data, **fsorted_args) + post
|
||||||
|
|
||||||
|
|
||||||
|
def dissect(dump, destdir, fsorted, fsorted_args):
    """Split the dump file into one file per section inside ``destdir``.

    dump         -- path of the plain-text dump to read
    destdir      -- existing directory that receives the section files
    fsorted      -- ``sorted``-like callable used for the data rows
    fsorted_args -- dict of keyword arguments passed to ``fsorted``
    """
    buf = buffer(destdir, fsorted, fsorted_args)

    # The context manager closes the input even if parsing raises.
    with open(dump) as src:
        for line in src:
            # trim trailing newline (if any)
            if line.endswith('\n'):
                line = line[:-1]

            if buf.st == state.EMPTY:
                if buf.proc_comment(line) or '' == line:
                    continue
                # First real statement outside any section: collect it
                # under the SETTINGS section.
                buf.flushto(state.SETTINGS, "%d-%s" % (state.SETTINGS.value, "SETTINGS"),
                            '-- Sorted PostgreSQL database dump')
                buf.append(line)

            elif buf.st in (state.SETTINGS, state.DEF, state.INSERT):
                if not buf.proc_comment(line):
                    buf.append(line)

            elif buf.st == state.DATA:
                # The first statement decides how the rows are delimited.
                if line.startswith('COPY '):
                    buf.st = state.COPY
                elif line.startswith('INSERT '):
                    buf.st = state.INSERT
                buf.append(line)

            elif buf.st == state.COPY:
                buf.append(line)
                # End-of-data marker closes the section.
                if r'\.' == line:
                    buf.flushto(state.EMPTY, None, None)

            elif buf.st == state.SEQSET:
                if buf.proc_comment(line):
                    continue
                if line.startswith('SELECT pg_catalog.setval'):
                    # Rewrite the call to a fixed value so the sequence's
                    # current position does not show up in the output.
                    # A statement the pattern cannot parse is kept as is.
                    m = RE_SEQSET.match(line)
                    if m:
                        line = "SELECT pg_catalog.setval('%s', 1, false);" % m.group('name')
                buf.append(line)

            else:
                print("This should not happen")

    # Write out whatever section was still open at end of file.
    buf.flushto(state.EMPTY, None, None)
|
||||||
|
|
||||||
|
|
||||||
|
def recombine(destdir, dump, fsorted, fsorted_args):
    """Concatenate the section files of ``destdir`` into ``dump``.

    destdir      -- directory holding one file per section
    dump         -- path of the output file (created or truncated)
    fsorted      -- ``sorted``-like callable that orders the file names
    fsorted_args -- dict of keyword arguments passed to ``fsorted``

    Sections are separated by one empty line.  A closing comment is added
    unless the directory was empty, in which case the output is empty too.
    """
    # The context manager closes the output even if a section cannot be read.
    with open(dump, 'w') as out:
        sorted_files = fsorted(os.listdir(destdir), **fsorted_args)

        for index, fname in enumerate(sorted_files):
            if index:
                out.write('\n')
            with open(os.path.join(destdir, fname)) as f:
                out.write(f.read())

        if sorted_files:
            out.writelines([
                '\n',
                '--\n',
                '-- Sorted dump complete\n',
                '--\n',
            ])
|
||||||
|
|
||||||
|
|
||||||
|
def pgdump_sort(dump, sdump, fsorted=sorted, **fsorted_args):
    """Write a sorted copy of the dump file ``dump`` to ``sdump``.

    dump         -- path of the dump to read
    sdump        -- path of the sorted dump to write
    fsorted      -- ``sorted``-like callable used for all ordering
    fsorted_args -- extra keyword arguments forwarded to ``fsorted``

    The dump is split into section files in a scratch directory, which is
    removed again whether or not the processing succeeds.
    """
    scratch = tempfile.mkdtemp(
        prefix='pgdump-sort',
        suffix=os.path.basename(dump),
    )
    try:
        dissect(dump, scratch, fsorted, fsorted_args)
        recombine(scratch, sdump, fsorted, fsorted_args)
    finally:
        shutil.rmtree(scratch)
|
||||||
|
|
||||||
|
natsort_error = \
|
||||||
|
"""In order to use natural sort you need to install natsort module:
|
||||||
|
pip install natsort
|
||||||
|
"""
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Command-line entry point; the usage text is the module docstring.
    args = docopt(__doc__, version=version)

    dump = args['<dump>']
    sdump = args['<sorted-dump>']
    if sdump is None:
        # Default output name: "<dump without .sql>-sorted.sql"
        sdump = re.sub(r'\.sql$', '', dump) + '-sorted.sql'

    if args['-n']:
        if not _has_natsort:
            print(natsort_error, file=sys.stderr)
            # sys.exit rather than the `exit` helper, which is injected by
            # the site module and is not guaranteed to exist in a script.
            sys.exit(1)
        fsorted = natsort.natsorted
        fsorted_args = {'alg': natsort.ns.IGNORECASE}
    else:
        fsorted = sorted
        fsorted_args = {}

    pgdump_sort(dump, sdump, fsorted, **fsorted_args)
|
||||||
|
|
||||||
Reference in New Issue
Block a user