rs_encode.py — Reed-Solomon encoder

Usage

bzip2 -c file.json | python3 rs_encode.py [nsym]
# nsym: even, 2-120, default 32. Use 64 for ~5% OCR error tolerance.
# Output uses Base45 alphabet (0-9, A-Z, space, $%*+-./: only).

nsym guide

32 → ~2% OCR errors, ~1.7× overhead

64 → ~5% OCR errors, ~2.0× overhead

96 → ~7% OCR errors, ~2.5× overhead

Code

#!/usr/bin/env python3
"""
rs_encode.py — Reed-Solomon encode stdin, output Base45 to stdout.
Usage: bzip2 -c file.json | python rs_encode.py [nsym]
  nsym: check symbols per 255-byte block (default 32, corrects up to 16 byte errors)
  Output alphabet: 0-9, A-Z, space, $%*+-./:  (Base45, RFC 9285)
"""
import sys, struct

# ── Base45 (RFC 9285) ────────────────────────────────────────────────────
# Alphabet: 0-9, A-Z, space, $%*+-./:  (45 chars, OCR-safe)

B45_ALPHA = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ $%*+-./:'
assert len(B45_ALPHA) == 45

def b45_encode(data: bytes) -> str:
    out = []
    for i in range(0, len(data) - 1, 2):
        n = data[i] * 256 + data[i + 1]
        c, n = divmod(n, 45 * 45)
        b, a = divmod(n, 45)
        out.append(B45_ALPHA[a])
        out.append(B45_ALPHA[b])
        out.append(B45_ALPHA[c])
    if len(data) % 2:                      # odd trailing byte
        b, a = divmod(data[-1], 45)
        out.append(B45_ALPHA[a])
        out.append(B45_ALPHA[b])
    return ''.join(out)

# ── GF(2^8) ────────────────────────────────────────────────────────────
# Primitive polynomial: x^8 + x^4 + x^3 + x^2 + 1 = 0x11d

_EXP = [0] * 512
_LOG = [0] * 256

def _init_gf():
    x = 1
    for i in range(255):
        _EXP[i] = x
        _LOG[x] = i
        x <<= 1
        if x & 0x100:
            x ^= 0x11d
    for i in range(255, 512):
        _EXP[i] = _EXP[i - 255]

_init_gf()

def gmul(a, b):
    return 0 if (a == 0 or b == 0) else _EXP[_LOG[a] + _LOG[b]]

def gdiv(a, b):
    if b == 0: raise ZeroDivisionError
    return 0 if a == 0 else _EXP[(_LOG[a] - _LOG[b]) % 255]

def gpow(x, n):
    return _EXP[(_LOG[x] * n) % 255] if x else 0

def ginv(x):
    return _EXP[255 - _LOG[x]]

# ── Polynomials over GF(2^8) — HIGH-DEGREE FIRST ────────────────────────

def padd(p, q):
    r = bytearray(max(len(p), len(q)))
    for i, v in enumerate(p): r[i + len(r) - len(p)] ^= v
    for i, v in enumerate(q): r[i + len(r) - len(q)] ^= v
    return r

def pscale(p, x):
    return bytearray(gmul(c, x) for c in p)

def pmul(p, q):
    r = bytearray(len(p) + len(q) - 1)
    for i, a in enumerate(p):
        if a:
            for j, b in enumerate(q):
                r[i + j] ^= gmul(a, b)
    return r

def peval(p, x):
    y = p[0]
    for c in p[1:]:
        y = gmul(y, x) ^ c
    return y

def pdiv(dividend, divisor):
    """Returns (quotient, remainder)."""
    out = bytearray(dividend)
    for i in range(len(dividend) - len(divisor) + 1):
        c = out[i]
        if c:
            for j in range(1, len(divisor)):
                out[i + j] ^= gmul(divisor[j], c)
    cut = len(divisor) - 1
    return bytes(out[:-cut]), bytes(out[-cut:])

# ── RS generator polynomial ──────────────────────────────────────────────

_GEN_CACHE = {}

def rs_generator(nsym):
    if nsym not in _GEN_CACHE:
        g = bytearray([1])
        for i in range(nsym):
            g = pmul(g, bytearray([1, gpow(2, i)]))
        _GEN_CACHE[nsym] = g
    return _GEN_CACHE[nsym]

# ── RS encode one block ──────────────────────────────────────────────────

def rs_encode_block(data, nsym):
    """data: bytes, len <= 255-nsym.  Returns bytearray of len(data)+nsym."""
    gen = rs_generator(nsym)
    msg = bytearray(data) + bytearray(nsym)
    for i in range(len(data)):
        c = msg[i]
        if c:
            for j in range(1, len(gen)):
                msg[i + j] ^= gmul(gen[j], c)
    return bytearray(data) + msg[len(data):]

# ── Top-level encode ─────────────────────────────────────────────────────

MAGIC = b'\xd3\x52\x53\xec'
DEFAULT_NSYM = 32

def encode(raw, nsym=DEFAULT_NSYM):
    block_data = 255 - nsym
    payload = MAGIC + bytes([nsym]) + struct.pack('>I', len(raw)) + raw
    out = bytearray()
    for i in range(0, len(payload), block_data):
        chunk = payload[i : i + block_data]
        out += rs_encode_block(chunk, nsym)
    return bytes(out)

def main():
    nsym = int(sys.argv[1]) if len(sys.argv) > 1 else DEFAULT_NSYM
    if not (2 <= nsym <= 120 and nsym % 2 == 0):
        sys.exit('nsym must be even, 2-120')
    raw = sys.stdin.read().encode()
    encoded = encode(raw, nsym)
    b45 = b45_encode(encoded)
    sys.stdout.write(b45 + '\n')
    ratio = len(b45) / max(len(raw), 1)
    sys.stderr.write(
        f'raw={len(raw)}B  payload={len(raw)+9}B  '
        f'RS-encoded={len(encoded)}B  base45={len(b45)}chars  '
        f'ratio={ratio:.2f}x  nsym={nsym} (corrects up to {nsym//2} byte-errors/block)\n'
    )

if __name__ == '__main__':
    main()

Perl port (rs_encode.pl)

#!/usr/bin/env perl
# rs_encode.pl — Reed-Solomon encode stdin, output base64 to stdout.
# Usage: bzip2 -c file.json | perl rs_encode.pl [nsym] | tr -d '\n'

use strict;
use warnings;
use MIME::Base64;

# -- GF(2^8) --

my @EXP = (0) x 512;
my @LOG = (0) x 256;

{
    my $x = 1;
    for my $i (0..254) {
        $EXP[$i] = $x;
        $LOG[$x] = $i;
        $x <<= 1;
        $x ^= 0x11d if $x & 0x100;
    }
    $EXP[$_] = $EXP[$_ - 255] for 255..511;
}

sub gmul {
    my ($a, $b) = @_;
    return 0 if $a == 0 || $b == 0;
    return $EXP[$LOG[$a] + $LOG[$b]];
}

sub gpow {
    my ($x, $n) = @_;
    return 0 unless $x;
    return $EXP[($LOG[$x] * $n) % 255];
}

# -- Polynomials over GF(2^8) — HIGH-DEGREE FIRST --

sub pmul {
    my ($p, $q) = @_;
    my @r = (0) x (@$p + @$q - 1);
    for my $i (0..$#$p) {
        next unless $p->[$i];
        for my $j (0..$#$q) {
            $r[$i + $j] ^= gmul($p->[$i], $q->[$j]);
        }
    }
    return \@r;
}

# -- RS generator polynomial --

my %gen_cache;

sub rs_generator {
    my ($nsym) = @_;
    unless (exists $gen_cache{$nsym}) {
        my $g = [1];
        $g = pmul($g, [1, gpow(2, $_)]) for 0..$nsym-1;
        $gen_cache{$nsym} = $g;
    }
    return $gen_cache{$nsym};
}

# -- RS encode one block --

sub rs_encode_block {
    my ($data, $nsym) = @_;    # $data: arrayref of ints
    my $gen = rs_generator($nsym);
    my @msg = (@$data, (0) x $nsym);
    for my $i (0..$#$data) {
        my $c = $msg[$i];
        next unless $c;
        $msg[$i + $_] ^= gmul($gen->[$_], $c) for 1..$#$gen;
    }
    return [ @$data, @msg[scalar(@$data)..$#msg] ];
}

# -- Top-level encode --

my $MAGIC        = "\xd3\x52\x53\xec";
my $DEFAULT_NSYM = 32;

sub encode {
    my ($raw, $nsym) = @_;
    my $block_data = 255 - $nsym;
    my $payload    = $MAGIC . chr($nsym) . pack('N', length($raw)) . $raw;
    my @bytes      = unpack('C*', $payload);
    my @out;
    for (my $i = 0; $i < @bytes; $i += $block_data) {
        my $end   = $i + $block_data - 1;
        $end      = $#bytes if $end > $#bytes;
        my @chunk = @bytes[$i..$end];
        push @out, @{ rs_encode_block(\@chunk, $nsym) };
    }
    return pack('C*', @out);
}

sub main {
    my $nsym = @ARGV ? int($ARGV[0]) : $DEFAULT_NSYM;
    die "nsym must be even, 2-120\n" unless $nsym >= 2 && $nsym <= 120 && $nsym % 2 == 0;

    binmode STDIN;
    local $/;
    my $raw = <STDIN>;

    my $encoded = encode($raw, $nsym);
    my $b64     = encode_base64($encoded, '');
    print $b64, "\n";

    printf STDERR
        "raw=%dB  payload=%dB  RS-encoded=%dB  base64=%dchars  ratio=%.2fx  nsym=%d (corrects up to %d byte-errors/block)\n",
        length($raw), length($raw) + 9, length($encoded), length($b64),
        length($b64) / (length($raw) || 1), $nsym, $nsym / 2;
}

main();
description Encode stdin with RS error correction, output Base45 to stdout. For transmitting data via screenshot+OCR.  ·  version 4  ·  updated 2026-04-27  ·  tags ['python', 'reed-solomon', 'ocr', 'base45']