Extract and insert 97 images from reference PDF into songs

Extract images from the CL6 PDF using pdfimages, map them to songs via page-to-CL number matching, and insert \songimage commands. Add insert-images.py script for repeatable extraction. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-02 19:03:07 +02:00
parent c202f1a792
commit 0e8660cd41
274 changed files with 471 additions and 0 deletions
--- a/insert-images.py
+++ b/insert-images.py
@@ -0,0 +1,277 @@
+#!/usr/bin/env python3
+"""
+Insert \\songimage{} commands into song .tex files based on the image-to-song
+mapping extracted from the reference PDF.
+
+Steps:
+  1. Run `pdfimages -list` to get image number -> PDF page mapping
+  2. Run `pdftotext -layout` to get PDF page -> CL number mapping
+  3. Read song .tex files to get CL number -> song filename mapping
+  4. For each song with at least one mapped image, add
+     \\songimage{images/img-NNN.ext} for its largest candidate (by file size)
+
+Idempotent: skips songs that already have \\songimage or \\fillerpage commands.
+"""
+
+import os
+import re
+import subprocess
+import sys
+from pathlib import Path
+from collections import defaultdict
+
+# --- Configuration ---
+# The defaults are the original absolute locations. Each can be overridden
+# through an environment variable so the script also works from another
+# checkout or machine without editing the source.
+PROJECT_DIR = Path(
+    os.environ.get("SONGBOOK_PROJECT_DIR", "/home/shahondin1624/Projects/songbook")
+)
+SONGS_DIR = PROJECT_DIR / "songs"    # song .tex files that receive the image command
+IMAGES_DIR = PROJECT_DIR / "images"  # extracted img-NNN.jpg / img-NNN.png files
+PDF_PATH = Path(
+    os.environ.get(
+        "SONGBOOK_REFERENCE_PDF",
+        "/home/shahondin1624/Documents/Pfadfinder/Liederbuch/280824_Liederbuch_CL6_2025.pdf",
+    )
+)
+MIN_FILE_SIZE = 5 * 1024  # 5KB minimum; smaller extracted files are ignored
+
+
+def get_image_page_mapping():
+    """Run ``pdfimages -list`` on PDF_PATH and index the listed images.
+
+    The listing columns are, in order: page, num, type, width, height, color,
+    comp, bpc, enc, interp, object, ID, x-ppi, y-ppi, size, ratio. The size
+    column is therefore index 14 (index 12 is x-ppi).
+
+    Returns:
+        dict mapping image number (int) to ``(page, type, size_str)``, where
+        ``page`` is an int, ``type`` is the listing's type column (e.g.
+        "image", "smask") and ``size_str`` is the size column (e.g. "333K",
+        "9107B"), or "" when the row has no size column.
+
+    Raises:
+        subprocess.CalledProcessError: if pdfimages exits with a non-zero status.
+    """
+    result = subprocess.run(
+        ["pdfimages", "-list", str(PDF_PATH)],
+        capture_output=True, text=True, check=True
+    )
+    size_column = 14
+    images = {}
+    for line in result.stdout.splitlines():
+        parts = line.split()
+        # Data rows have at least 14 columns and begin with two integers
+        # (page, num). This rejects the "page num type ..." header, the
+        # dashed separator and any other non-tabular output without raising.
+        if len(parts) < 14:
+            continue
+        if not (parts[0].isdigit() and parts[1].isdigit()):
+            continue
+        page = int(parts[0])
+        img_num = int(parts[1])
+        img_type = parts[2]  # "image", "smask"
+        size_str = parts[size_column] if len(parts) > size_column else ""
+        images[img_num] = (page, img_type, size_str)
+    return images
+
+
+def parse_size(size_str):
+    """Parse a size string like '333K', '9107B', '1.5M' or '2G' into bytes.
+
+    Args:
+        size_str: a number optionally followed by one of the unit suffixes
+            B, K, M or G (binary multiples of 1024). Surrounding whitespace
+            is ignored. A string without a suffix is taken as bytes.
+
+    Returns:
+        The size in bytes as an int (fractional bytes are truncated).
+
+    Raises:
+        ValueError: if the numeric part cannot be parsed (including "").
+    """
+    multipliers = {"B": 1, "K": 1024, "M": 1024 ** 2, "G": 1024 ** 3}
+    text = size_str.strip()
+    # text[-1:] is "" for an empty string, so the lookup never raises.
+    factor = multipliers.get(text[-1:])
+    if factor is None:
+        return int(float(text))
+    return int(float(text[:-1]) * factor)
+
+
+def get_page_to_cl_mapping():
+    """Derive a {pdf_page: CL number} mapping from ``pdftotext -layout`` output.
+
+    The text output is split on form feeds, one chunk per page (pages are
+    numbered from 1). For each page the last two lines are inspected: the
+    second-to-last must contain "CL", and the number on the last line whose
+    start column is nearest to that "CL" (and fewer than 8 columns away) is
+    taken as the page's CL number. Pages that do not fit this pattern are
+    left out of the result.
+
+    Raises:
+        subprocess.CalledProcessError: if pdftotext exits with a non-zero status.
+    """
+    completed = subprocess.run(
+        ["pdftotext", "-layout", str(PDF_PATH), "-"],
+        capture_output=True, text=True, check=True
+    )
+    number_pattern = re.compile(r"\d+")
+    page_to_cl = {}
+
+    for page_no, page_text in enumerate(completed.stdout.split("\f"), start=1):
+        footer = page_text.strip().split("\n")
+        if len(footer) < 2:
+            continue
+
+        label_row, value_row = footer[-2], footer[-1]
+        cl_column = label_row.find("CL")
+        if cl_column < 0:
+            continue
+
+        numbers = list(number_pattern.finditer(value_row))
+        if not numbers:
+            continue
+
+        # min() keeps the first of several equally distant numbers.
+        nearest = min(numbers, key=lambda m: abs(m.start() - cl_column))
+        if abs(nearest.start() - cl_column) < 8:
+            page_to_cl[page_no] = int(nearest.group())
+
+    return page_to_cl
+
+
+def get_cl_to_song_mapping():
+    """Scan SONGS_DIR/*.tex and return {cl_number: song_filepath}.
+
+    A file's CL number is the first ``cl = <digits>`` occurrence anywhere in
+    its text. Files without one are ignored. Files are visited in sorted
+    order, so when two files share a CL number the later one wins.
+    """
+    cl_pattern = re.compile(r"cl\s*=\s*(\d+)")
+    mapping = {}
+    for song_file in sorted(SONGS_DIR.glob("*.tex")):
+        found = cl_pattern.search(song_file.read_text(encoding="utf-8"))
+        if found is None:
+            continue
+        mapping[int(found.group(1))] = song_file
+    return mapping
+
+
+def get_image_file(img_num):
+    """Locate the extracted file for image number ``img_num`` in IMAGES_DIR.
+
+    Tries ``img-NNN.jpg`` and then ``img-NNN.png`` (NNN zero-padded to three
+    digits). Returns ``(path, ext)`` for the first one that exists, or
+    ``(None, None)`` when neither does.
+    """
+    stem = f"img-{img_num:03d}"
+    candidates = ((IMAGES_DIR / f"{stem}.{ext}", ext) for ext in ("jpg", "png"))
+    return next(
+        ((path, ext) for path, ext in candidates if path.exists()),
+        (None, None),
+    )
+
+
+def song_already_has_image(content):
+    """Return True if ``content`` contains \\songimage or \\fillerpage anywhere."""
+    markers = (r"\songimage", r"\fillerpage")
+    return any(marker in content for marker in markers)
+
+
+def insert_songimage(filepath, image_path_rel):
+    """Write \\songimage{image_path_rel} into a song file before its first \\end{song}.
+
+    The command is separated from the preceding text by a blank line and is
+    followed by a blank line. The file is rewritten in place as UTF-8.
+
+    Args:
+        filepath: Path of the song .tex file.
+        image_path_rel: image path to place inside the command's braces.
+
+    Returns:
+        True if the file was modified. False if it already contains an image
+        or filler command, or has no \\end{song} (a warning is printed then).
+    """
+    original = filepath.read_text(encoding="utf-8")
+    if song_already_has_image(original):
+        return False
+
+    split_at = original.find("\\end{song}")
+    if split_at < 0:
+        print(f"  WARNING: No \\end{{song}} found in {filepath.name}")
+        return False
+
+    head, tail = original[:split_at], original[split_at:]
+
+    # Pad the head so it always ends with a blank line before the command.
+    if head.endswith("\n\n"):
+        padding = ""
+    elif head.endswith("\n"):
+        padding = "\n"
+    else:
+        padding = "\n\n"
+
+    command = f"\\songimage{{{image_path_rel}}}\n\n"
+    filepath.write_text(head + padding + command + tail, encoding="utf-8")
+    return True
+
+
+def main():
+    """Map extracted PDF images to songs and insert one image per song.
+
+    Pipeline: image -> PDF page (pdfimages), PDF page -> CL number
+    (pdftotext), CL number -> song file (song sources). Soft-mask images,
+    images without an extracted file, files smaller than MIN_FILE_SIZE and
+    images that cannot be mapped to a song are skipped. When several images
+    map to the same song, the largest file is used. Progress and a summary
+    are printed to stdout.
+
+    Songs whose insertion fails (no \\end{song} in the file) are reported
+    separately instead of being counted as "already had images".
+    """
+    print("=" * 60)
+    print("Image-to-Song Insertion Script")
+    print("=" * 60)
+
+    # Step 1: Get image -> page mapping from PDF
+    print("\n[1] Running pdfimages -list...")
+    image_info = get_image_page_mapping()
+    print(f"    Found {len(image_info)} images in PDF")
+
+    # Step 2: Get page -> CL mapping from PDF text
+    print("\n[2] Running pdftotext -layout...")
+    page_to_cl = get_page_to_cl_mapping()
+    print(f"    Found CL numbers for {len(page_to_cl)} pages")
+
+    # Step 3: Get CL -> song file mapping
+    print("\n[3] Reading song .tex files...")
+    cl_to_song = get_cl_to_song_mapping()
+    print(f"    Found {len(cl_to_song)} songs with CL numbers")
+
+    # Step 4: Build image -> CL mapping, filtering out smasks and small images
+    print("\n[4] Building image-to-song mapping...")
+    skipped_smask = 0
+    skipped_small = 0
+    skipped_no_file = 0
+    skipped_no_cl = 0
+    skipped_no_song = 0
+
+    # Collect: CL number -> list of (image_num, file_path, file_size)
+    cl_to_images = defaultdict(list)
+
+    for img_num, (page, img_type, _size_str) in sorted(image_info.items()):
+        # Skip smask (alpha channel masks)
+        if img_type == "smask":
+            skipped_smask += 1
+            continue
+
+        # Check if image file exists
+        img_path, _ext = get_image_file(img_num)
+        if img_path is None:
+            skipped_no_file += 1
+            continue
+
+        # Skip small images; the size on disk is authoritative, not the
+        # size column of the pdfimages listing.
+        file_size = img_path.stat().st_size
+        if file_size < MIN_FILE_SIZE:
+            skipped_small += 1
+            continue
+
+        # Map page -> CL number
+        cl_num = page_to_cl.get(page)
+        if cl_num is None:
+            skipped_no_cl += 1
+            continue
+
+        # Check if we have a song for this CL
+        if cl_num not in cl_to_song:
+            skipped_no_song += 1
+            continue
+
+        cl_to_images[cl_num].append((img_num, img_path, file_size))
+
+    print(f"    Skipped {skipped_smask} smask images")
+    print(f"    Skipped {skipped_small} images < {MIN_FILE_SIZE // 1024}KB")
+    print(f"    Skipped {skipped_no_file} images with no extracted file")
+    print(f"    Skipped {skipped_no_cl} images on pages without CL number")
+    print(f"    Skipped {skipped_no_song} images whose CL has no song file")
+    print(f"    Found images for {len(cl_to_images)} songs")
+
+    # Step 5: Insert images into songs
+    print("\n[5] Inserting \\songimage commands...")
+    inserted = 0
+    skipped_existing = 0
+    failed_insert = 0
+    songs_with_multiple = 0
+
+    for cl_num in sorted(cl_to_images):
+        candidates = cl_to_images[cl_num]
+        song_path = cl_to_song[cl_num]
+
+        # Check if song already has an image
+        content = song_path.read_text(encoding="utf-8")
+        if song_already_has_image(content):
+            skipped_existing += 1
+            continue
+
+        # Pick the largest image by file size (first one wins on a tie)
+        if len(candidates) > 1:
+            songs_with_multiple += 1
+        img_num, img_path, file_size = max(candidates, key=lambda c: c[2])
+
+        # Determine the relative path for the \songimage command
+        image_rel = f"images/img-{img_num:03d}{img_path.suffix}"
+
+        # Insert into the song file
+        if insert_songimage(song_path, image_rel):
+            size_kb = file_size / 1024
+            print(f"    CL {cl_num:>3d}: {song_path.name} <- {image_rel} ({size_kb:.0f}KB)")
+            inserted += 1
+        else:
+            # The existing-image case was handled above, so a refusal here
+            # means the file has no \end{song} to insert before.
+            failed_insert += 1
+
+    # Summary
+    print("\n" + "=" * 60)
+    print("Summary")
+    print("=" * 60)
+    print(f"  Images in PDF:            {len(image_info)}")
+    print(f"  Songs with CL numbers:    {len(cl_to_song)}")
+    print(f"  Songs receiving images:   {inserted}")
+    print(f"  Songs already had images: {skipped_existing}")
+    print(f"  Songs where insertion failed: {failed_insert}")
+    print(f"  Songs with multiple candidates: {songs_with_multiple} (picked largest)")
+    print(f"  Images not mapped to any song: {skipped_no_cl + skipped_no_song}")
+
+
+# Run the insertion only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()