Files
backup-scripts/pgdump-sort
Vasili Sviridov 9702287a92 Workaround for pg_dump non-idempotent behaviour
Problem
=======

* `pg_dump` may dump the data rows in arbitrary order. This messes with
  the final hash of the dump file, even though the data after restoring
  would be same. This creates additional uploads with no value

Solution
========

* Utilize `pgdump-sort` script by `tigra564`, which attempts to sort
  data in the dump file specifically for hashing purposes.
2022-12-26 12:35:38 -08:00

271 lines
5.4 KiB
Python
Executable File

#!/usr/bin/python3
"""
Usage: pgdump-sort [options] <dump> [<sorted-dump>]
pgdump-sort -h | --help | --version
Options:
-n Sort entries in natural order (requires python3 module natsort)
-h --help Show this usage and exit
--version Show version and exit
"""
from docopt import docopt
import os
import sys
import re
import tempfile
import shutil
from enum import Enum
_has_natsort = False
try:
import natsort
_has_natsort = True
except ModuleNotFoundError:
pass
version='0.2'
RE_OBJDESC = re.compile(
'-- (?P<isdata>(Data for )?)Name: (?P<name>.*?); '
'Type: (?P<type>.*?); '
'Schema: (?P<schema>.*?); '
'Owner: (?P<owner>.*)'
)
RE_SEQSET = re.compile("SELECT pg_catalog.setval\('(?P<name>.*?)'.*")
class state(Enum):
EMPTY = 1
SETTINGS = 2
DEF = 3
DATA = 4
COPY = 5
INSERT = 6
SEQSET = 7
class buffer(list):
destdir = None
st = state.EMPTY
fname = None
title = None
def __init__(self, destdir, fsorted, fsorted_args):
self.destdir = destdir
self.fsorted = fsorted
self.fsorted_args = fsorted_args
def flushto(self, st, fname, title):
#print("EVICTING", self.st, "to", self.fname, "New state:", st)
# Trim ellipsing comments and empty lines
while self and ('' == self[0] or self[0].startswith('--')):
del self[0]
while self and ('' == self[-1] or self[-1].startswith('--')):
del self[-1]
if len(self):
if self.st in (state.COPY, state.INSERT):
self[:] = sort_datalines(self, self.fsorted, self.fsorted_args)
self[:] = [
'--',
self.title,
'--',
'',
] + self
with open(os.path.join(self.destdir, self.fname), "w") as out:
out.writelines([l + '\n' for l in self])
self.clear()
self.st = st
self.fname = fname
self.title = title
def proc_comment(self, line):
# Returns True if the line is a comment, i.e. it has been processed
if not line.startswith('--'):
return False
m = re.match(RE_OBJDESC, line)
if not m:
return True
if 'SEQUENCE SET' == m.group('type'):
st = state.SEQSET
elif m.group('isdata'):
st = state.DATA
else:
st = state.DEF
fname = '%d-%s-%s-%s-%s' % (
st.value,
m.group('type'),
m.group('schema'),
m.group('name'),
m.group('owner')
)
if 255 < len(fname):
fname = fname[:255-3] + "..."
self.flushto(st, fname, line)
return True
def sort_datalines(lines, fsorted, fsorted_args):
pre = []
data = []
post = []
state = 0
ptr = pre
isins = False
for line in lines:
if 0 == state:
if line.startswith('COPY'):
ptr.append(line)
ptr = data
state = 1
elif line.startswith('INSERT'):
ptr = data
ptr.append(line)
isins = True
state = 1
else:
ptr.append(line)
elif 1 == state:
if isins and '\n' == line or not isins and '\\.\n' == line:
ptr = post
ptr.append(line)
status = 2
else:
ptr.append(line)
else:
ptr.append(line)
return pre + fsorted(data, **fsorted_args) + post
def dissect(dump, destdir, fsorted, fsorted_args):
buf = buffer(destdir, fsorted, fsorted_args)
for line in open(dump):
# trim trailing newline (if any)
if '\n' == line[-1]:
line = line[:-1]
#print(buf.st.name.ljust(10), "\t[%s]" % line)
if buf.st == state.EMPTY:
if buf.proc_comment(line):
pass
elif '' == line:
pass
else:
buf.flushto(state.SETTINGS, "%d-%s" % (state.SETTINGS.value, "SETTINGS"),
'-- Sorted PostgreSQL database dump')
buf.append(line)
elif buf.st in (state.SETTINGS, state.DEF, state.INSERT):
if buf.proc_comment(line):
pass
else:
buf.append(line)
elif buf.st == state.DATA:
if line.startswith('COPY '):
buf.st = state.COPY
elif line.startswith('INSERT '):
buf.st = state.INSERT
buf.append(line)
elif buf.st == state.COPY:
buf.append(line)
if r'\.' == line:
buf.flushto(state.EMPTY, None, None)
elif buf.st == state.SEQSET:
if buf.proc_comment(line):
pass
elif line.startswith('SELECT pg_catalog.setval'):
m = re.match(RE_SEQSET, line)
line = "SELECT pg_catalog.setval('%s', 1, false);" % m.group('name')
buf.append(line)
else:
buf.append(line)
else:
print("This should not happen")
buf.flushto(state.EMPTY, None, None)
def recombine(destdir, dump, fsorted, fsorted_args):
out = open(dump, 'w')
first = True
sorted_files = fsorted(os.listdir(destdir), **fsorted_args)
for fname in sorted_files:
if first:
first = False
else:
out.write('\n')
with open(os.path.join(destdir, fname)) as f:
out.writelines(f.readlines())
if sorted_files:
out.writelines([
'\n',
'--\n',
'-- Sorted dump complete\n',
'--\n',
])
out.close()
def pgdump_sort(dump, sdump, fsorted=sorted, **fsorted_args):
destdir = tempfile.mkdtemp(suffix=os.path.basename(dump), prefix='pgdump-sort')
try:
dissect(dump, destdir, fsorted, fsorted_args)
recombine(destdir, sdump, fsorted, fsorted_args)
finally:
shutil.rmtree(destdir)
natsort_error = \
"""In order to use natural sort you need to install natsort module:
pip install natsort
"""
if __name__ == '__main__':
args = docopt(__doc__, version=version)
dump = args['<dump>']
sdump = args['<sorted-dump>']
if sdump is None:
sdump = re.sub(r'\.sql$', '', dump) + '-sorted.sql'
if args['-n']:
if _has_natsort:
fsorted = natsort.natsorted
fsorted_args = {'alg': natsort.ns.IGNORECASE}
else:
print(natsort_error, file=sys.stderr)
exit(1)
else:
fsorted = sorted
fsorted_args = {}
pgdump_sort(dump, sdump, fsorted, **fsorted_args)