# songbook/insert-images.py

#!/usr/bin/env python3
"""
Insert \\songimage{} commands into song .tex files based on the image-to-song
mapping extracted from the reference PDF.

Steps:
  1. Run `pdfimages -list` to get image number -> PDF page mapping
  2. Run `pdftotext -layout` to get PDF page -> CL number mapping
  3. Read song .tex files to get CL number -> song filename mapping
  4. For each song with at least one mapped image, add
     \\songimage{images/img-NNN.ext} for the largest of its images

Idempotent: skips songs that already have \\songimage or \\fillerpage commands.
"""

import os
import re
import subprocess
import sys
from pathlib import Path
from collections import defaultdict

# --- Configuration ---
# The defaults below are the original hard-coded locations. Set the
# SONGBOOK_PROJECT_DIR and/or SONGBOOK_PDF_PATH environment variables to run
# the script against a checkout or reference PDF stored somewhere else.
PROJECT_DIR = Path(
    os.environ.get("SONGBOOK_PROJECT_DIR", "/home/shahondin1624/Projects/songbook")
)
# Song .tex files that receive the \songimage commands.
SONGS_DIR = PROJECT_DIR / "songs"
# Extracted image files, expected to be named img-NNN.jpg / img-NNN.png.
IMAGES_DIR = PROJECT_DIR / "images"
# Reference PDF that is inspected with pdfimages and pdftotext.
PDF_PATH = Path(
    os.environ.get(
        "SONGBOOK_PDF_PATH",
        "/home/shahondin1624/Documents/Pfadfinder/Liederbuch/280824_Liederbuch_CL6_2025.pdf",
    )
)
# Extracted image files smaller than this many bytes are ignored.
MIN_FILE_SIZE = 5 * 1024  # 5KB minimum


def get_image_page_mapping():
    """Run ``pdfimages -list`` and return {image_num: (page, type, size_str)}.

    Every data row of the listing is split on whitespace. The first three
    fields are the page number, the image number and the image type
    (e.g. "image" or "smask"). The embedded size (e.g. "333K", "9107B") is
    the second-to-last field, immediately before the compression ratio.

    Returns:
        dict mapping the image number (int) to a tuple
        ``(page, img_type, size_str)`` with ``page`` as int and the other
        two as the raw strings from the listing.

    Raises:
        subprocess.CalledProcessError: if ``pdfimages`` exits with a
            non-zero status (``check=True``).
    """
    result = subprocess.run(
        ["pdfimages", "-list", str(PDF_PATH)],
        capture_output=True, text=True, check=True
    )
    images = {}
    for line in result.stdout.strip().split("\n"):
        # Skip the column-title line and the dashed separator below it.
        if line.startswith(("page", "---")):
            continue
        parts = line.split()
        # Rows with too few fields are not image rows.
        if len(parts) < 14:
            continue
        page = int(parts[0])
        img_num = int(parts[1])
        img_type = parts[2]  # "image", "smask"
        # Count from the right: the row ends with "... x-ppi y-ppi size ratio",
        # so index 12 from the left is x-ppi, not the size. Indexing from the
        # end also stays correct if an earlier column yields fewer tokens.
        size_str = parts[-2]  # e.g. "333K", "9107B"
        images[img_num] = (page, img_type, size_str)
    return images


def parse_size(size_str):
    """Parse a size string like '333K', '9107B' or '1.5M' into bytes.

    Surrounding whitespace is ignored. A trailing ``B``, ``K``, ``M`` or
    ``G`` selects a multiplier of 1, 1024, 1024**2 or 1024**3; a string
    without one of these suffixes is taken as a plain byte count. The
    result is truncated to an int.

    Raises:
        ValueError: if the numeric part cannot be parsed as a float
            (this includes the empty string).
    """
    multipliers = {"B": 1, "K": 1024, "M": 1024 ** 2, "G": 1024 ** 3}
    text = size_str.strip()
    # text[-1:] (not text[-1]) so an empty string falls through to float("")
    # and raises ValueError instead of IndexError.
    suffix = text[-1:]
    if suffix in multipliers:
        return int(float(text[:-1]) * multipliers[suffix])
    return int(float(text))


def get_page_to_cl_mapping():
    """Run ``pdftotext -layout`` and return {page_number: CL number}.

    The text output is split on form feeds, and pages are numbered from 1 in
    that order. For each page only the last two lines (after stripping the
    page text) are inspected: the second-to-last line must contain "CL", and
    the last line is searched for integers. The integer whose start column
    lies closest to the column of "CL" is taken, provided it is fewer than
    8 columns away. Pages that do not satisfy this are left out of the result.

    Raises:
        subprocess.CalledProcessError: if ``pdftotext`` exits with a
            non-zero status.
    """
    completed = subprocess.run(
        ["pdftotext", "-layout", str(PDF_PATH), "-"],
        capture_output=True, text=True, check=True
    )
    mapping = {}

    for page_no, page_text in enumerate(completed.stdout.split("\f"), start=1):
        rows = page_text.strip().split("\n")
        if len(rows) < 2:
            continue

        header_line, number_line = rows[-2], rows[-1]

        cl_pos = header_line.find("CL")
        if cl_pos < 0:
            continue

        # (distance to the CL column, number) for every number near enough.
        nearby = []
        for found in re.finditer(r"\d+", number_line):
            distance = abs(found.start() - cl_pos)
            if distance < 8:
                nearby.append((distance, int(found.group())))

        if nearby:
            # min() keeps the left-most candidate when distances tie.
            mapping[page_no] = min(nearby, key=lambda pair: pair[0])[1]

    return mapping


def get_cl_to_song_mapping():
    """Scan the song .tex files and return {cl_number: song_filepath}.

    Files in SONGS_DIR are visited in sorted order. The first ``cl = <digits>``
    occurrence in a file supplies its CL number; files without one are
    ignored. If two files carry the same number, the later file wins.
    """
    cl_pattern = re.compile(r"cl\s*=\s*(\d+)")
    mapping = {}
    for song_file in sorted(SONGS_DIR.glob("*.tex")):
        found = cl_pattern.search(song_file.read_text(encoding="utf-8"))
        if found is None:
            continue
        mapping[int(found.group(1))] = song_file
    return mapping


def get_image_file(img_num):
    """Locate the extracted image file for an image number.

    Looks for ``img-NNN.jpg`` and then ``img-NNN.png`` (number zero-padded to
    three digits) in IMAGES_DIR.

    Returns:
        ``(path, ext)`` for the first file that exists, or ``(None, None)``
        when neither exists.
    """
    stem = f"img-{img_num:03d}"
    for ext in ("jpg", "png"):
        candidate = IMAGES_DIR / f"{stem}.{ext}"
        if candidate.exists():
            return candidate, ext
    return None, None


def song_already_has_image(content):
    """Return True if the text contains \\songimage or \\fillerpage anywhere."""
    markers = (r"\songimage", r"\fillerpage")
    return any(marker in content for marker in markers)


def insert_songimage(filepath, image_path_rel):
    """Insert \\songimage{path} directly before the first \\end{song} of a file.

    The command is separated from the preceding text by a blank line and is
    followed by a blank line. The file is rewritten in place as UTF-8.

    Returns:
        True if the file was modified. False if the file already contains a
        \\songimage or \\fillerpage command, or if it has no \\end{song}
        (a warning is printed in that case).
    """
    original = filepath.read_text(encoding="utf-8")
    if song_already_has_image(original):
        return False

    closing = re.search(r"\\end\{song\}", original)
    if closing is None:
        print(f"  WARNING: No \\end{{song}} found in {filepath.name}")
        return False

    split_at = closing.start()
    head, tail = original[:split_at], original[split_at:]

    # Top up the newlines that already end `head` to exactly one blank line.
    trailing_newlines = len(head) - len(head.rstrip("\n"))
    padding = "\n" * max(0, 2 - trailing_newlines)

    command = f"\\songimage{{{image_path_rel}}}\n\n"
    filepath.write_text(head + padding + command + tail, encoding="utf-8")
    return True


def main():
    """Map extracted PDF images to songs and insert one \\songimage per song.

    Pipeline:
      1. image number -> PDF page (``pdfimages -list``)
      2. PDF page -> CL number (``pdftotext -layout``)
      3. CL number -> song .tex file
      4. collect the usable image files per CL number, skipping smask
         entries, images without an extracted file, files smaller than
         MIN_FILE_SIZE, pages without a CL number and CL numbers without a
         song file
      5. for every song that has no \\songimage/\\fillerpage yet, insert the
         largest candidate image

    Progress and a final summary are printed to stdout.
    """
    print("=" * 60)
    print("Image-to-Song Insertion Script")
    print("=" * 60)

    # Step 1: Get image -> page mapping from PDF
    print("\n[1] Running pdfimages -list...")
    image_info = get_image_page_mapping()
    print(f"    Found {len(image_info)} images in PDF")

    # Step 2: Get page -> CL mapping from PDF text
    print("\n[2] Running pdftotext -layout...")
    page_to_cl = get_page_to_cl_mapping()
    print(f"    Found CL numbers for {len(page_to_cl)} pages")

    # Step 3: Get CL -> song file mapping
    print("\n[3] Reading song .tex files...")
    cl_to_song = get_cl_to_song_mapping()
    print(f"    Found {len(cl_to_song)} songs with CL numbers")

    # Step 4: Build image -> CL mapping, filtering out smasks and small images
    print("\n[4] Building image-to-song mapping...")
    skipped_smask = 0
    skipped_small = 0
    skipped_no_file = 0
    skipped_no_cl = 0
    skipped_no_song = 0

    # Collect: CL number -> list of (image_num, file_path, file_size)
    cl_to_images = defaultdict(list)

    # The size string from the listing is not used here; the size of the
    # extracted file on disk decides.
    for img_num, (page, img_type, _size_str) in sorted(image_info.items()):
        # Skip smask (alpha channel masks)
        if img_type == "smask":
            skipped_smask += 1
            continue

        # Check if image file exists
        img_path, _ = get_image_file(img_num)
        if img_path is None:
            skipped_no_file += 1
            continue

        # Skip small images
        file_size = img_path.stat().st_size
        if file_size < MIN_FILE_SIZE:
            skipped_small += 1
            continue

        # Map page -> CL number
        if page not in page_to_cl:
            skipped_no_cl += 1
            continue

        cl_num = page_to_cl[page]

        # Check if we have a song for this CL
        if cl_num not in cl_to_song:
            skipped_no_song += 1
            continue

        cl_to_images[cl_num].append((img_num, img_path, file_size))

    print(f"    Skipped {skipped_smask} smask images")
    print(f"    Skipped {skipped_small} images < {MIN_FILE_SIZE // 1024}KB")
    print(f"    Skipped {skipped_no_file} images with no extracted file")
    print(f"    Skipped {skipped_no_cl} images on pages without CL number")
    print(f"    Skipped {skipped_no_song} images whose CL has no song file")
    print(f"    Found images for {len(cl_to_images)} songs")

    # Step 5: Insert images into songs
    print("\n[5] Inserting \\songimage commands...")
    inserted = 0
    skipped_existing = 0
    skipped_failed = 0
    songs_with_multiple = 0

    for cl_num in sorted(cl_to_images):
        candidates = cl_to_images[cl_num]
        song_path = cl_to_song[cl_num]

        # Check if song already has an image
        content = song_path.read_text(encoding="utf-8")
        if song_already_has_image(content):
            skipped_existing += 1
            continue

        # Pick the largest image by file size. Candidates were appended in
        # ascending image-number order and max() returns the first maximum,
        # so ties go to the lowest image number.
        if len(candidates) > 1:
            songs_with_multiple += 1
        img_num, img_path, file_size = max(candidates, key=lambda c: c[2])

        # Relative path for the \songimage command; the file name is
        # img-NNN.<ext> as found by get_image_file.
        image_rel = f"images/{img_path.name}"

        # Insert into the song file
        if insert_songimage(song_path, image_rel):
            size_kb = file_size / 1024
            print(f"    CL {cl_num:>3d}: {song_path.name} <- {image_rel} ({size_kb:.0f}KB)")
            inserted += 1
        else:
            # The "already has an image" case was handled above, so a False
            # here normally means \end{song} was missing (insert_songimage
            # prints the warning). Do not report it as "already had images".
            skipped_failed += 1

    # Summary
    print("\n" + "=" * 60)
    print("Summary")
    print("=" * 60)
    print(f"  Images in PDF:            {len(image_info)}")
    print(f"  Songs with CL numbers:    {len(cl_to_song)}")
    print(f"  Songs receiving images:   {inserted}")
    print(f"  Songs already had images: {skipped_existing}")
    print(f"  Songs skipped (insert failed): {skipped_failed}")
    print(f"  Songs with multiple candidates: {songs_with_multiple} (picked largest)")
    print(f"  Images not mapped to any song: {skipped_no_cl + skipped_no_song}")


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()