From c3832e43b9451efd8aa3c77ff0392a949aa05e22 Mon Sep 17 00:00:00 2001 From: Luca Toniolo <10792599+grandixximo@users.noreply.github.com> Date: Fri, 12 Jun 2026 19:32:02 +0800 Subject: [PATCH] docs: deduplicate built HTML images into a shared image/ tree The HTML doc build copies every referenced image into each language and topic directory, so the built tree carries the same bytes many times over (238 MB of images, only 30 MB unique). Add docs/src/tools/dedup-images.py: it collapses byte-identical images (SHA-256) into a shared root image/ tree, keeps only translated overrides under /image/, and rewrites every src and click-to-enlarge href to match. Dry-run by default, idempotent, and self-verifying: after --apply it re-resolves every reference and fails if any is broken. Wire it into the htmldocs build as a final .dedup-images-stamp step. The tool preserves the mtime of every HTML file it rewrites, so a second `make htmldocs` does no work. On the full translated tree this takes the build output from 325 MB to 119 MB (images 238 MB to 30 MB) with all references verified. --- docs/src/Submakefile | 12 +- docs/src/tools/dedup-images.py | 377 +++++++++++++++++++++++++++++++++ 2 files changed, 387 insertions(+), 2 deletions(-) create mode 100644 docs/src/tools/dedup-images.py diff --git a/docs/src/Submakefile b/docs/src/Submakefile index d3102f8ecd5..c221572f6df 100644 --- a/docs/src/Submakefile +++ b/docs/src/Submakefile @@ -573,7 +573,7 @@ $(DOC_OUT_HTML)/pdf/index.html: $(PDF_TARGETS) ../scripts/make-docs-pdf-index @mkdir -p $(dir $@) $(Q)../scripts/make-docs-pdf-index -htmldocs: svgs_made_from_dots .htmldoc-stamp checkref_en +htmldocs: svgs_made_from_dots .htmldoc-stamp .dedup-images-stamp checkref_en # When translations are enabled, the .adoc files in $(L)/ are produced by # the translateddocs target (po4a). 
Teach make how to ask for them: the @@ -595,6 +595,13 @@ endif .htmldoc-stamp: .copy-asciidoc-stamp $(DOC_DIR)/.gen_complist-stamp $(HTML_TARGETS) .images-stamp .include-stamp $(DOC_OUT_HTML)/asciidoctor.css $(DOC_OUT_HTML)/rouge-github.css .lang-switcher-stamp touch $@ +# Collapse byte-identical images into a shared image/ tree and rewrite refs. +# Runs last; the tool preserves HTML mtimes so a second `make htmldocs` is a +# no-op. +.dedup-images-stamp: $(DOC_SRCDIR)/tools/dedup-images.py .htmldoc-stamp + $(Q)python3 $(DOC_SRCDIR)/tools/dedup-images.py --html-root $(DOC_OUT_HTML) --apply + @touch $@ + # Inject the whole-document sidebar/topbar and grey out missing language- # switcher entries. Runs last (depends on every HTML target) and is # idempotent. Gated on BUILD_DOCS_HTML, not translations: the sidebar comes @@ -705,7 +712,7 @@ checkref_en: $(DOC_DIR)/.checkref-english-stamp # (w3c-linkchecker disables file:// URIs), so the tree may carry accumulated # broken links. Report them without breaking the build for now; drop this # flag once the backlog is cleared so regressions fail the build again. 
-$(DOC_DIR)/.checkref-english-stamp: $(DOC_TARGETS_HTML_EN) $(DOC_OUT_HTML)/en/index.html $(DOC_OUT_HTML)/en/gcode.html .htmldoc-stamp +$(DOC_DIR)/.checkref-english-stamp: $(DOC_TARGETS_HTML_EN) $(DOC_OUT_HTML)/en/index.html $(DOC_OUT_HTML)/en/gcode.html .htmldoc-stamp .dedup-images-stamp $(DOC_SRCDIR)/checkref --warn-on-failure English $(filter %.html,$^) @touch $@ @@ -1313,6 +1320,7 @@ docclean: -rm -f $(DOC_SRCDIR)/*/*.html -rm -rf $(DOC_FONT_DIR) -rm -f .htmldoc-stamp + -rm -f .dedup-images-stamp -rm -f .copy-asciidoc-stamp -rm -f .adoc-images-stamp -rm -f .html-images-stamp diff --git a/docs/src/tools/dedup-images.py b/docs/src/tools/dedup-images.py new file mode 100644 index 00000000000..d7be5e83858 --- /dev/null +++ b/docs/src/tools/dedup-images.py @@ -0,0 +1,377 @@ +#!/usr/bin/env python3 +# dedup-images.py -- deduplicate images in the built LinuxCNC HTML docs tree. +# +# The build copies every referenced image into every language tree (and every +# topic that references a shared image), storing the same bytes many times. +# This rewrites the tree to: +# html/image/ generic images, one copy per unique content +# html//image/ only the images a language actually translates +# and rewrites every and click-to-enlarge to match. Images +# are matched by SHA-256, so byte-identical copies collapse to a single file. +# +# Dry-run by default; --apply rewrites in place; --check only verifies refs. +# After --apply it re-resolves every reference and fails if any is broken. It +# is idempotent and touches only the output tree, so it can run post-build. + +import argparse +import hashlib +import os +import re +import sys + +IMAGE_EXT = ('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp') +# Refs the build leaves alone: external URLs, data URIs, absolute paths, logo. 
SKIP_REF = re.compile(r'^(https?:|data:|/|#)|lcnc-docs\.svg', re.IGNORECASE)
# Double-quoted src="..." / href="..." attributes; the named groups are read
# back as m.group('attr') and m.group('val') by the rewrite and verify passes.
REF_RE = re.compile(r'(?P<attr>\b(?:src|href))="(?P<val>[^"]+)"', re.IGNORECASE)

DEFAULT_CANONICAL_LANG = 'en'
DEFAULT_IMAGE_DIR = 'image'

# Upper bound on the broken references listed individually in a report.
MAX_BROKEN_REPORTED = 50


def log(msg):
    """Write one line of progress/diagnostic output to stderr."""
    sys.stderr.write(msg + '\n')


def human(n):
    """Format a byte count with a binary unit (B, KiB, MiB, GiB)."""
    for unit in ('B', 'KiB', 'MiB', 'GiB'):
        if n < 1024 or unit == 'GiB':
            return ('%.1f %s' % (n, unit)) if unit != 'B' else ('%d B' % n)
        n /= 1024.0


def sha256_file(path):
    """Return the hex SHA-256 digest of a file, read in 64 KiB chunks."""
    h = hashlib.sha256()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(1 << 16), b''):
            h.update(chunk)
    return h.hexdigest()


def is_image_ref(val):
    """True if a src/href value is an in-tree image we should manage."""
    if SKIP_REF.search(val):
        return False
    base = val.split('#', 1)[0].split('?', 1)[0]
    return base.lower().endswith(IMAGE_EXT)


def discover_langs(html_root, image_dir):
    """Return the sorted sub-directory names of html_root, minus image_dir."""
    langs = []
    for name in sorted(os.listdir(html_root)):
        p = os.path.join(html_root, name)
        if os.path.isdir(p) and name != image_dir:
            langs.append(name)
    return langs


def rel_posix(path, start):
    """Return path relative to start, using '/' as the separator."""
    return os.path.relpath(path, start).replace(os.sep, '/')


class Plan:
    """Computed relocation plan for the whole tree."""

    def __init__(self, html_root, langs, image_dir, canonical_lang):
        self.html_root = html_root
        self.langs = langs
        self.image_dir = image_dir
        self.canonical_lang = canonical_lang
        self.inventory = {}       # inventory[lang][P] = (abspath, sha, size); P lang-relative posix
        self.generic_hash = {}    # generic_hash[P] = sha of the generic (en or majority) content
        self.canonical_path = {}  # canonical_path[sha] = posix path under html_root
        self.dest_abs = {}        # dest_abs[(lang, P)] = absolute destination after relocation
        self.total_bytes = 0
        self.unique_bytes = 0

    def _lang_root(self, lang):
        return os.path.join(self.html_root, lang)

    def _under_image_dir(self, p_posix):
        # exclude the new layout (root image/ and <lang>/image/) so re-runs are no-ops
        return p_posix == self.image_dir or p_posix.startswith(self.image_dir + '/')

    def build_inventory(self):
        """Hash every not-yet-relocated image file under each language tree."""
        for lang in self.langs:
            root = self._lang_root(lang)
            self.inventory[lang] = {}
            for dirpath, _, filenames in os.walk(root):
                for fn in filenames:
                    if not fn.lower().endswith(IMAGE_EXT):
                        continue
                    ap = os.path.join(dirpath, fn)
                    if os.path.islink(ap):
                        continue
                    P = rel_posix(ap, root)
                    if self._under_image_dir(P):
                        continue  # already-relocated image; leave it
                    sha = sha256_file(ap)
                    size = os.path.getsize(ap)
                    self.inventory[lang][P] = (ap, sha, size)

    def classify(self):
        """Choose the generic content per path and a destination per file.

        Fills generic_hash, canonical_path and dest_abs, and accumulates
        total_bytes / unique_bytes. Requires inventory[lang] to be populated
        for every entry of self.langs.
        """
        # All logical paths seen in any language.
        all_paths = set()
        for lang in self.langs:
            all_paths.update(self.inventory[lang].keys())

        # Generic content for each path: the canonical lang's bytes, else majority.
        # NOTE(review): when the canonical language has no copy of a path, the
        # majority content is written to the shared root image path; on a
        # partial re-run that could replace a shared file placed by an earlier
        # run -- confirm the build always re-copies every language together.
        for P in all_paths:
            cl = self.canonical_lang
            if cl in self.inventory and P in self.inventory[cl]:
                self.generic_hash[P] = self.inventory[cl][P][1]
                continue
            counts = {}
            for lang in self.langs:
                e = self.inventory[lang].get(P)
                if e:
                    counts[e[1]] = counts.get(e[1], 0) + 1
            # majority; deterministic tie-break by hash
            self.generic_hash[P] = sorted(counts.items(),
                                          key=lambda kv: (-kv[1], kv[0]))[0][0]

        # One canonical path per generic content (keyed by hash, so identical
        # bytes anywhere collapse to one file). Representative = shortest then
        # lexicographically first path holding that content.
        reps = {}  # sha -> (depth, P)
        for P in sorted(all_paths):
            g = self.generic_hash[P]
            has = any(self.inventory[lang].get(P)
                      and self.inventory[lang][P][1] == g
                      for lang in self.langs)
            if not has:
                continue
            key = (P.count('/'), P)
            cur = reps.get(g)
            if cur is None or key < cur:
                reps[g] = key
        for sha, (_, P) in reps.items():
            self.canonical_path[sha] = self.image_dir + '/' + P

        # Per-(lang, P) destination + byte accounting.
        seen_unique = set()  # keys of destinations whose size was already counted
        for lang in self.langs:
            for P, (ap, sha, size) in self.inventory[lang].items():
                self.total_bytes += size
                if sha == self.generic_hash.get(P):
                    dest = os.path.join(self.html_root,
                                        *self.canonical_path[sha].split('/'))
                    key = ('G', sha)
                else:
                    # translated / language-specific image
                    dest = os.path.join(self._lang_root(lang), self.image_dir,
                                        *P.split('/'))
                    key = ('S', lang, sha, P)
                self.dest_abs[(lang, P)] = dest
                if key not in seen_unique:
                    seen_unique.add(key)
                    self.unique_bytes += size

    # ---- reference resolution -------------------------------------------
    def resolve_ref(self, page_abs, lang, val):
        """Map a src/href value on a page to its (P, dest_abs) or None.

        The value is resolved against the page's directory. Targets inside
        the page's own language tree are looked up under ``lang``. A target
        that escapes that tree is retried against html_root, so a reference
        into another scanned language tree (e.g. ``../en/x.png``) follows the
        file when it is relocated. In that case the returned P is relative to
        the tree that owns the file.
        """
        page_dir = os.path.dirname(page_abs)
        base = val.split('#', 1)[0].split('?', 1)[0]
        target_old = os.path.normpath(os.path.join(page_dir, base))
        owner = lang
        P = rel_posix(target_old, self._lang_root(lang))
        if P == '..' or P.startswith('../'):
            # Outside this page's language tree: see whether the target lives
            # in another inventoried tree directly under html_root.
            rel_root = rel_posix(target_old, self.html_root)
            owner, sep, P = rel_root.partition('/')
            if not sep or owner == '..' or owner not in self.inventory:
                return None  # outside every scanned tree
        if self._under_image_dir(P):
            return None  # already relocated
        dest = self.dest_abs.get((owner, P))
        if dest is None:
            return None
        return P, dest

    def new_ref_value(self, page_abs, dest_abs, original_val):
        """Return the page-relative posix ref to dest_abs, keeping any suffix."""
        page_dir = os.path.dirname(page_abs)
        new = rel_posix(dest_abs, page_dir)
        # preserve any #fragment/?query suffix
        suffix = original_val[len(original_val.split('#', 1)[0].split('?', 1)[0]):]
        return new + suffix


def iter_html(html_root, langs):
    """Yield (lang, absolute path) for every .html/.htm file under each lang."""
    for lang in langs:
        for dirpath, _, filenames in os.walk(os.path.join(html_root, lang)):
            for fn in filenames:
                if fn.lower().endswith(('.html', '.htm')):
                    yield lang, os.path.join(dirpath, fn)


def rewrite_html_file(plan, lang, page_abs, apply_changes):
    """Point the image references of one page at their planned destinations.

    Returns the number of references that change. The file is written only
    when apply_changes is true and at least one reference changed.
    """
    # surrogateescape round-trips bytes that are not valid UTF-8, and
    # newline='' keeps the file's own line endings, so only the rewritten
    # attribute values differ on disk.
    with open(page_abs, 'r', encoding='utf-8', errors='surrogateescape',
              newline='') as f:
        text = f.read()
    changed = 0

    def repl(m):
        nonlocal changed
        val = m.group('val')
        if not is_image_ref(val):
            return m.group(0)
        r = plan.resolve_ref(page_abs, lang, val)
        if r is None:
            return m.group(0)
        _, dest_abs = r
        newval = plan.new_ref_value(page_abs, dest_abs, val)
        if newval == val:
            return m.group(0)
        changed += 1
        return '%s="%s"' % (m.group('attr'), newval)

    new_text = REF_RE.sub(repl, text)
    if apply_changes and changed:
        st = os.stat(page_abs)
        with open(page_abs, 'w', encoding='utf-8', errors='surrogateescape',
                  newline='') as f:
            f.write(new_text)
        # Preserve mtime so an incremental `make` does not re-run the build.
        # Nanosecond values avoid the rounding of float timestamps.
        os.utime(page_abs, ns=(st.st_atime_ns, st.st_mtime_ns))
    return changed


def relocate_files(plan):
    """Copy every image to its planned destination, then drop the originals.

    Works in three passes: copy, delete originals that are not themselves a
    destination, and prune directories left empty.
    """
    import shutil

    # Preferred source per content hash: the canonical language's copy when
    # it has one, otherwise the first copy found in plan.langs order. The
    # canonical language may not be among the scanned trees, hence .get().
    cl = plan.canonical_lang
    generic_src = {}
    for lang in [cl] + [other for other in plan.langs if other != cl]:
        for ap, sha, _ in plan.inventory.get(lang, {}).values():
            generic_src.setdefault(sha, ap)

    # 1) copy each distinct destination once
    placed = set()
    for (lang, P), dest in plan.dest_abs.items():
        if dest in placed:
            continue
        ap, sha, _ = plan.inventory[lang][P]
        src = generic_src[sha] if sha == plan.generic_hash.get(P) else ap
        os.makedirs(os.path.dirname(dest), exist_ok=True)
        if os.path.abspath(src) != os.path.abspath(dest):
            shutil.copy2(src, dest)
        placed.add(dest)

    # 2) delete the originals
    placed_abs = {os.path.abspath(d) for d in placed}
    for lang in plan.langs:
        for ap, _, _ in plan.inventory[lang].values():
            if os.path.abspath(ap) in placed_abs:
                continue
            try:
                os.remove(ap)
            except FileNotFoundError:
                pass

    # 3) prune empty directories (bottom-up), never the new image dirs
    for lang in plan.langs:
        for dirpath, _, _ in os.walk(plan._lang_root(lang), topdown=False):
            if os.path.basename(dirpath) == plan.image_dir:
                continue
            try:
                if not os.listdir(dirpath):
                    os.rmdir(dirpath)
            except OSError:
                # best effort: a directory that cannot be listed/removed stays
                pass


def verify(plan):
    """Re-resolve every image reference; return list of broken (page, val)."""
    broken = []
    for lang, page in iter_html(plan.html_root, plan.langs):
        with open(page, 'r', encoding='utf-8', errors='surrogateescape') as f:
            text = f.read()
        page_dir = os.path.dirname(page)
        for m in REF_RE.finditer(text):
            val = m.group('val')
            if not is_image_ref(val):
                continue
            base = val.split('#', 1)[0].split('?', 1)[0]
            target = os.path.normpath(os.path.join(page_dir, base))
            if not os.path.isfile(target):
                broken.append((page, val))
    return broken


def _report_broken(header, broken, html_root):
    """Log a header plus the first MAX_BROKEN_REPORTED broken references."""
    log(header % len(broken))
    for page, val in broken[:MAX_BROKEN_REPORTED]:
        log(' %s -> %s' % (rel_posix(page, html_root), val))
    if len(broken) > MAX_BROKEN_REPORTED:
        log(' ... and %d more' % (len(broken) - MAX_BROKEN_REPORTED))


def main():
    """Command-line entry point.

    Returns the process exit status: 0 on success, 1 when broken image
    references are found, 2 for an unusable --html-root or language list.
    """
    # The module carries only '#' header comments, so __doc__ is None; give
    # argparse an explicit description instead.
    parser = argparse.ArgumentParser(
        description='Deduplicate images in the built LinuxCNC HTML docs tree.',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('--html-root', required=True,
                        help='built HTML tree root (e.g. docs/build/html)')
    parser.add_argument('--apply', action='store_true',
                        help='rewrite the tree in place (default: dry-run report)')
    parser.add_argument('--check', action='store_true',
                        help='only verify that every image reference resolves')
    parser.add_argument('--langs', default='',
                        help='comma-separated language dirs (default: autodetect)')
    parser.add_argument('--canonical-lang', default=DEFAULT_CANONICAL_LANG,
                        help='language whose images are the generic default (default: en)')
    parser.add_argument('--image-dir', default=DEFAULT_IMAGE_DIR,
                        help='name of the shared image directory (default: image)')
    parser.add_argument('-v', '--verbose', action='store_true')
    args = parser.parse_args()

    html_root = os.path.abspath(args.html_root)
    if not os.path.isdir(html_root):
        log('error: not a directory: %s' % html_root)
        return 2

    langs = ([name for name in args.langs.split(',') if name] if args.langs
             else discover_langs(html_root, args.image_dir))
    if not langs:
        log('error: no language directories found under %s' % html_root)
        return 2

    if args.check:
        plan = Plan(html_root, langs, args.image_dir, args.canonical_lang)
        broken = verify(plan)
        if broken:
            _report_broken('FAIL: %d broken image reference(s):', broken, html_root)
            return 1
        log('OK: all image references resolve')
        return 0

    plan = Plan(html_root, langs, args.image_dir, args.canonical_lang)
    log('Scanning %d language tree(s): %s' % (len(langs), ', '.join(langs)))
    plan.build_inventory()
    plan.classify()

    n_files = sum(len(plan.inventory[lang]) for lang in langs)
    n_generic = len(plan.canonical_path)
    # Language-specific destinations sit under some <lang>/<image-dir>/ but
    # not under the shared root <image-dir>/.
    marker = os.sep + args.image_dir + os.sep
    root_image_prefix = os.path.join(html_root, args.image_dir) + os.sep
    n_specific = sum(1 for dest in plan.dest_abs.values()
                     if marker in dest
                     and not dest.startswith(root_image_prefix))
    saved = plan.total_bytes - plan.unique_bytes

    log('')
    log('Image files (with duplication): %d, %s' % (n_files, human(plan.total_bytes)))
    log('Unique after dedup: %s' % human(plan.unique_bytes))
    log('Generic images (root image/): %d' % n_generic)
    log('Language-specific (translated): %d' % n_specific)
    log('Space reclaimed: %s (%.1f%%)'
        % (human(saved), 100.0 * saved / plan.total_bytes if plan.total_bytes else 0))

    # count reference rewrites
    total_refs = 0
    for lang, page in iter_html(html_root, langs):
        total_refs += rewrite_html_file(plan, lang, page, apply_changes=args.apply)

    if not args.apply:
        log('')
        log('Dry run: %d image reference(s) would be rewritten across the HTML.' % total_refs)
        log('Re-run with --apply to perform the changes.')
        return 0

    log('')
    log('Rewrote %d image reference(s).' % total_refs)
    relocate_files(plan)
    broken = verify(plan)
    if broken:
        _report_broken('ERROR: %d broken image reference(s) after apply:',
                       broken, html_root)
        return 1
    log('Verification passed: all image references resolve.')
    return 0


if __name__ == '__main__':
    sys.exit(main())