bzip2 -c file.json | python3 rs_encode.py [nsym]
# nsym: even, 2-120, default 32. Use 64 for ~5% OCR error tolerance.
# Output uses Base45 alphabet (0-9, A-Z, space, $%*+-./: only).
32 → ~2% OCR errors, ~1.7× overhead
64 → ~5% OCR errors, ~2.0× overhead
96 → ~7% OCR errors, ~2.5× overhead
#!/usr/bin/env python3
"""
rs_encode.py — Reed-Solomon encode stdin, output Base45 to stdout.
Usage: bzip2 -c file.json | python rs_encode.py [nsym]
nsym: check symbols per 255-byte block (default 32, corrects up to 16 byte errors)
Output alphabet: 0-9, A-Z, space, $%*+-./: (Base45, RFC 9285)
"""
import sys, struct
# ── Base45 (RFC 9285) ────────────────────────────────────────────────────
# Alphabet: 0-9, A-Z, space, $%*+-./: (45 chars, OCR-safe)
B45_ALPHA = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ $%*+-./:'
assert len(B45_ALPHA) == 45
def b45_encode(data: bytes) -> str:
out = []
for i in range(0, len(data) - 1, 2):
n = data[i] * 256 + data[i + 1]
c, n = divmod(n, 45 * 45)
b, a = divmod(n, 45)
out.append(B45_ALPHA[a])
out.append(B45_ALPHA[b])
out.append(B45_ALPHA[c])
if len(data) % 2: # odd trailing byte
b, a = divmod(data[-1], 45)
out.append(B45_ALPHA[a])
out.append(B45_ALPHA[b])
return ''.join(out)
# ── GF(2^8) ────────────────────────────────────────────────────────────
# Primitive polynomial: x^8 + x^4 + x^3 + x^2 + 1 = 0x11d
_EXP = [0] * 512
_LOG = [0] * 256
def _init_gf():
x = 1
for i in range(255):
_EXP[i] = x
_LOG[x] = i
x <<= 1
if x & 0x100:
x ^= 0x11d
for i in range(255, 512):
_EXP[i] = _EXP[i - 255]
_init_gf()
def gmul(a, b):
return 0 if (a == 0 or b == 0) else _EXP[_LOG[a] + _LOG[b]]
def gdiv(a, b):
if b == 0: raise ZeroDivisionError
return 0 if a == 0 else _EXP[(_LOG[a] - _LOG[b]) % 255]
def gpow(x, n):
return _EXP[(_LOG[x] * n) % 255] if x else 0
def ginv(x):
return _EXP[255 - _LOG[x]]
# ── Polynomials over GF(2^8) — HIGH-DEGREE FIRST ────────────────────────
def padd(p, q):
r = bytearray(max(len(p), len(q)))
for i, v in enumerate(p): r[i + len(r) - len(p)] ^= v
for i, v in enumerate(q): r[i + len(r) - len(q)] ^= v
return r
def pscale(p, x):
return bytearray(gmul(c, x) for c in p)
def pmul(p, q):
r = bytearray(len(p) + len(q) - 1)
for i, a in enumerate(p):
if a:
for j, b in enumerate(q):
r[i + j] ^= gmul(a, b)
return r
def peval(p, x):
y = p[0]
for c in p[1:]:
y = gmul(y, x) ^ c
return y
def pdiv(dividend, divisor):
"""Returns (quotient, remainder)."""
out = bytearray(dividend)
for i in range(len(dividend) - len(divisor) + 1):
c = out[i]
if c:
for j in range(1, len(divisor)):
out[i + j] ^= gmul(divisor[j], c)
cut = len(divisor) - 1
return bytes(out[:-cut]), bytes(out[-cut:])
# ── RS generator polynomial ──────────────────────────────────────────────
_GEN_CACHE = {}
def rs_generator(nsym):
if nsym not in _GEN_CACHE:
g = bytearray([1])
for i in range(nsym):
g = pmul(g, bytearray([1, gpow(2, i)]))
_GEN_CACHE[nsym] = g
return _GEN_CACHE[nsym]
# ── RS encode one block ──────────────────────────────────────────────────
def rs_encode_block(data, nsym):
"""data: bytes, len <= 255-nsym. Returns bytearray of len(data)+nsym."""
gen = rs_generator(nsym)
msg = bytearray(data) + bytearray(nsym)
for i in range(len(data)):
c = msg[i]
if c:
for j in range(1, len(gen)):
msg[i + j] ^= gmul(gen[j], c)
return bytearray(data) + msg[len(data):]
# ── Top-level encode ─────────────────────────────────────────────────────
MAGIC = b'\xd3\x52\x53\xec'
DEFAULT_NSYM = 32
def encode(raw, nsym=DEFAULT_NSYM):
block_data = 255 - nsym
payload = MAGIC + bytes([nsym]) + struct.pack('>I', len(raw)) + raw
out = bytearray()
for i in range(0, len(payload), block_data):
chunk = payload[i : i + block_data]
out += rs_encode_block(chunk, nsym)
return bytes(out)
def main():
nsym = int(sys.argv[1]) if len(sys.argv) > 1 else DEFAULT_NSYM
if not (2 <= nsym <= 120 and nsym % 2 == 0):
sys.exit('nsym must be even, 2-120')
raw = sys.stdin.read().encode()
encoded = encode(raw, nsym)
b45 = b45_encode(encoded)
sys.stdout.write(b45 + '\n')
ratio = len(b45) / max(len(raw), 1)
sys.stderr.write(
f'raw={len(raw)}B payload={len(raw)+9}B '
f'RS-encoded={len(encoded)}B base45={len(b45)}chars '
f'ratio={ratio:.2f}x nsym={nsym} (corrects up to {nsym//2} byte-errors/block)\n'
)
if __name__ == '__main__':
main()
#!/usr/bin/env perl
# rs_encode.pl — Reed-Solomon encode stdin, output base64 to stdout.
# Usage: bzip2 -c file.json | perl rs_encode.pl [nsym] | tr -d '\n'
use strict;
use warnings;
use MIME::Base64;
# -- GF(2^8) --
my @EXP = (0) x 512;
my @LOG = (0) x 256;
{
my $x = 1;
for my $i (0..254) {
$EXP[$i] = $x;
$LOG[$x] = $i;
$x <<= 1;
$x ^= 0x11d if $x & 0x100;
}
$EXP[$_] = $EXP[$_ - 255] for 255..511;
}
sub gmul {
my ($a, $b) = @_;
return 0 if $a == 0 || $b == 0;
return $EXP[$LOG[$a] + $LOG[$b]];
}
sub gpow {
my ($x, $n) = @_;
return 0 unless $x;
return $EXP[($LOG[$x] * $n) % 255];
}
# -- Polynomials over GF(2^8) — HIGH-DEGREE FIRST --
sub pmul {
my ($p, $q) = @_;
my @r = (0) x (@$p + @$q - 1);
for my $i (0..$#$p) {
next unless $p->[$i];
for my $j (0..$#$q) {
$r[$i + $j] ^= gmul($p->[$i], $q->[$j]);
}
}
return \@r;
}
# -- RS generator polynomial --
my %gen_cache;
sub rs_generator {
my ($nsym) = @_;
unless (exists $gen_cache{$nsym}) {
my $g = [1];
$g = pmul($g, [1, gpow(2, $_)]) for 0..$nsym-1;
$gen_cache{$nsym} = $g;
}
return $gen_cache{$nsym};
}
# -- RS encode one block --
sub rs_encode_block {
my ($data, $nsym) = @_; # $data: arrayref of ints
my $gen = rs_generator($nsym);
my @msg = (@$data, (0) x $nsym);
for my $i (0..$#$data) {
my $c = $msg[$i];
next unless $c;
$msg[$i + $_] ^= gmul($gen->[$_], $c) for 1..$#$gen;
}
return [ @$data, @msg[scalar(@$data)..$#msg] ];
}
# -- Top-level encode --
my $MAGIC = "\xd3\x52\x53\xec";
my $DEFAULT_NSYM = 32;
sub encode {
my ($raw, $nsym) = @_;
my $block_data = 255 - $nsym;
my $payload = $MAGIC . chr($nsym) . pack('N', length($raw)) . $raw;
my @bytes = unpack('C*', $payload);
my @out;
for (my $i = 0; $i < @bytes; $i += $block_data) {
my $end = $i + $block_data - 1;
$end = $#bytes if $end > $#bytes;
my @chunk = @bytes[$i..$end];
push @out, @{ rs_encode_block(\@chunk, $nsym) };
}
return pack('C*', @out);
}
sub main {
my $nsym = @ARGV ? int($ARGV[0]) : $DEFAULT_NSYM;
die "nsym must be even, 2-120\n" unless $nsym >= 2 && $nsym <= 120 && $nsym % 2 == 0;
binmode STDIN;
local $/;
my $raw = <STDIN>;
my $encoded = encode($raw, $nsym);
my $b64 = encode_base64($encoded, '');
print $b64, "\n";
printf STDERR
"raw=%dB payload=%dB RS-encoded=%dB base64=%dchars ratio=%.2fx nsym=%d (corrects up to %d byte-errors/block)\n",
length($raw), length($raw) + 9, length($encoded), length($b64),
length($b64) / (length($raw) || 1), $nsym, $nsym / 2;
}
main();