Files
songbook/insert-images.py
shahondin1624 0e8660cd41 Extract and insert 97 images from reference PDF into songs
Extract images from the CL6 PDF using pdfimages, map them to songs
via page-to-CL number matching, and insert \songimage commands.
Add insert-images.py script for repeatable extraction.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-02 19:03:07 +02:00

278 lines
9.0 KiB
Python

#!/usr/bin/env python3
"""
Insert \\songimage{} commands into song .tex files based on the image-to-song
mapping extracted from the reference PDF.
Steps:
1. Run `pdfimages -list` to get image number -> PDF page mapping
2. Run `pdftotext -layout` to get PDF page -> CL number mapping
3. Read song .tex files to get CL number -> song filename mapping
4. For each image that maps to a song, add \\songimage{images/img-NNN.ext}
Idempotent: skips songs that already have \\songimage or \\fillerpage commands.
"""
import os
import re
import subprocess
import sys
from pathlib import Path
from collections import defaultdict
# --- Configuration ---
# The locations default to the original author's machine but can be
# overridden through environment variables, so the script is usable from
# any checkout without editing the source:
#   SONGBOOK_PROJECT_DIR  root of the songbook project
#   SONGBOOK_PDF_PATH     reference PDF the images are extracted from
PROJECT_DIR = Path(
    os.environ.get("SONGBOOK_PROJECT_DIR", "/home/shahondin1624/Projects/songbook")
)
SONGS_DIR = PROJECT_DIR / "songs"
IMAGES_DIR = PROJECT_DIR / "images"
PDF_PATH = Path(
    os.environ.get(
        "SONGBOOK_PDF_PATH",
        "/home/shahondin1624/Documents/Pfadfinder/Liederbuch/280824_Liederbuch_CL6_2025.pdf",
    )
)
# Extracted image files smaller than this many bytes (5 KiB) are ignored.
MIN_FILE_SIZE = 5 * 1024
def get_image_page_mapping():
    """Run ``pdfimages -list`` and return ``{image_num: (page, type, size_str)}``.

    The listing is a whitespace-separated table. The first three columns
    are the page number, the image number and the image type (for example
    "image" or "smask"). The embedded size (for example "333K" or "9107B")
    is taken from the second-to-last column; in the poppler listing it is
    followed only by the compression ratio.

    Rows with fewer than 14 fields, the header rows, and rows whose page or
    image number is not an integer are skipped.

    Raises:
        subprocess.CalledProcessError: if ``pdfimages`` exits non-zero.
    """
    result = subprocess.run(
        ["pdfimages", "-list", str(PDF_PATH)],
        capture_output=True, text=True, check=True,
    )
    images = {}
    for line in result.stdout.splitlines():
        # Skip the column-title row and the dashed rule underneath it.
        if line.startswith(("page", "---")):
            continue
        parts = line.split()
        if len(parts) < 14:
            continue
        try:
            page = int(parts[0])
            img_num = int(parts[1])
        except ValueError:
            # Not a data row; ignore it like any other malformed line.
            continue
        img_type = parts[2]  # "image", "smask"
        # Index 12 is the x-ppi column; the size sits right before the
        # trailing ratio column, so address it from the end of the row.
        size_str = parts[-2]  # e.g. "333K", "9107B"
        images[img_num] = (page, img_type, size_str)
    return images
def parse_size(size_str):
    """Parse a size string like '333K' or '9107B' into a number of bytes.

    Recognised suffixes are 'B' (bytes), 'K', 'M' and 'G' (powers of
    1024). A string without a suffix is read as a plain byte count.
    Surrounding whitespace is ignored and fractional values are truncated
    towards zero.

    Raises:
        ValueError: if the numeric part cannot be parsed (for example an
            empty string or the "-" placeholder).
    """
    multipliers = {"B": 1, "K": 1024, "M": 1024 ** 2, "G": 1024 ** 3}
    text = size_str.strip()
    # text[-1:] (not text[-1]) so an empty string falls through to float("")
    # and raises ValueError instead of IndexError.
    factor = multipliers.get(text[-1:])
    if factor is None:
        return int(float(text))
    return int(float(text[:-1]) * factor)
def get_page_to_cl_mapping():
    """Run ``pdftotext -layout`` and return ``{pdf_page: cl_number}``.

    The output is split into pages on form feeds and pages are numbered
    from 1. On each page the last two non-blank lines are treated as a
    label row and a number row: if the label row contains "CL", the number
    in the number row whose start column is closest to the "CL" column
    (and less than 8 columns away) is taken as that page's CL number.
    Pages with fewer than two lines, without "CL" in the label row, or
    without a number close enough are left out of the mapping.

    Raises:
        subprocess.CalledProcessError: if ``pdftotext`` exits non-zero.
    """
    result = subprocess.run(
        ["pdftotext", "-layout", str(PDF_PATH), "-"],
        capture_output=True, text=True, check=True,
    )
    page_to_cl = {}
    for page_no, page_text in enumerate(result.stdout.split("\f"), start=1):
        lines = page_text.rstrip().split("\n")
        # Drop leading blank lines, but keep the indentation of the first
        # real line: the matching below compares column positions, so
        # stripping it would shift the "CL" column on two-line pages.
        while lines and not lines[0].strip():
            del lines[0]
        if len(lines) < 2:
            continue
        header_line, number_line = lines[-2], lines[-1]
        cl_pos = header_line.find("CL")
        if cl_pos < 0:
            continue
        # Nearest number to the CL column; min() keeps the first of equally
        # distant candidates.
        nearby = (
            m for m in re.finditer(r"\d+", number_line)
            if abs(m.start() - cl_pos) < 8
        )
        best = min(nearby, key=lambda m: abs(m.start() - cl_pos), default=None)
        if best is not None:
            page_to_cl[page_no] = int(best.group())
    return page_to_cl
def get_cl_to_song_mapping():
    """Scan the song sources and return ``{cl_number: song_filepath}``.

    Every ``*.tex`` file directly inside SONGS_DIR is read as UTF-8 and
    searched for its first ``cl = <digits>`` occurrence (whitespace around
    the equals sign is optional). Files without such an occurrence are
    omitted. Files are visited in sorted order, so when two files carry the
    same number the later one replaces the earlier one.
    """
    cl_pattern = re.compile(r"cl\s*=\s*(\d+)")
    mapping = {}
    for song_path in sorted(SONGS_DIR.glob("*.tex")):
        found = cl_pattern.search(song_path.read_text(encoding="utf-8"))
        if found is None:
            continue
        mapping[int(found.group(1))] = song_path
    return mapping
def get_image_file(img_num):
    """Locate the extracted file that belongs to image number *img_num*.

    Looks in IMAGES_DIR for ``img-NNN.jpg`` first and ``img-NNN.png``
    second, where NNN is the number zero-padded to at least three digits.

    Returns:
        ``(path, ext)`` for the first file that exists, or ``(None, None)``
        when neither is present.
    """
    stem = f"img-{img_num:03d}"
    candidates = ((IMAGES_DIR / f"{stem}.{ext}", ext) for ext in ("jpg", "png"))
    existing = (pair for pair in candidates if pair[0].exists())
    return next(existing, (None, None))
def song_already_has_image(content):
    """Report whether *content* mentions ``\\songimage`` or ``\\fillerpage``.

    This is a plain substring test on the song source text; it returns True
    when either command name occurs anywhere in it.
    """
    markers = (r"\songimage", r"\fillerpage")
    return any(marker in content for marker in markers)
def insert_songimage(filepath, image_path_rel):
    """Write ``\\songimage{image_path_rel}`` into the song file at *filepath*.

    The command is placed immediately before the first ``\\end{song}`` in
    the file, preceded by a blank line and followed by one. The file is
    read and rewritten as UTF-8.

    Returns:
        True when the file was modified. False when the song already has an
        image or filler page (see ``song_already_has_image``) or when no
        ``\\end{song}`` is found; in the latter case a warning is printed.
    """
    text = filepath.read_text(encoding="utf-8")
    if song_already_has_image(text):
        return False

    closing = re.search(r"\\end\{song\}", text)
    if not closing:
        print(f" WARNING: No \\end{{song}} found in {filepath.name}")
        return False

    head = text[:closing.start()]
    tail = text[closing.start():]

    # Top up the newlines that already end `head` so that at least two
    # separate the preceding text from the inserted command.
    trailing_newlines = len(head) - len(head.rstrip("\n"))
    padding = "\n" * max(0, 2 - trailing_newlines)

    command = f"\\songimage{{{image_path_rel}}}\n\n"
    filepath.write_text(head + padding + command + tail, encoding="utf-8")
    return True
def main():
    """Run the whole extraction-to-insertion pipeline and print a report.

    Steps:
      1. Map image numbers to PDF pages (``get_image_page_mapping``).
      2. Map PDF pages to CL numbers (``get_page_to_cl_mapping``).
      3. Map CL numbers to song files (``get_cl_to_song_mapping``).
      4. Collect, per CL number, the usable extracted image files. Soft
         masks, images without an extracted file, files smaller than
         MIN_FILE_SIZE, and images that cannot be tied to a song are
         skipped and counted.
      5. For each song that has candidates and no image yet, insert the
         largest candidate (by file size) via ``insert_songimage``.

    Insertions that fail (``insert_songimage`` returned False although the
    song had no image) are reported on their own line rather than being
    counted as "already had images".
    """
    print("=" * 60)
    print("Image-to-Song Insertion Script")
    print("=" * 60)

    # Step 1: Get image -> page mapping from PDF
    print("\n[1] Running pdfimages -list...")
    image_info = get_image_page_mapping()
    print(f" Found {len(image_info)} images in PDF")

    # Step 2: Get page -> CL mapping from PDF text
    print("\n[2] Running pdftotext -layout...")
    page_to_cl = get_page_to_cl_mapping()
    print(f" Found CL numbers for {len(page_to_cl)} pages")

    # Step 3: Get CL -> song file mapping
    print("\n[3] Reading song .tex files...")
    cl_to_song = get_cl_to_song_mapping()
    print(f" Found {len(cl_to_song)} songs with CL numbers")

    # Step 4: Build image -> CL mapping, filtering out smasks and small images
    print("\n[4] Building image-to-song mapping...")
    skipped_smask = 0
    skipped_small = 0
    skipped_no_file = 0
    skipped_no_cl = 0
    skipped_no_song = 0
    # CL number -> list of (image_num, file_path, file_size)
    cl_to_images = defaultdict(list)
    for img_num, (page, img_type, _size_str) in sorted(image_info.items()):
        # Skip smask (alpha channel masks)
        if img_type == "smask":
            skipped_smask += 1
            continue
        # Check if image file exists
        img_path, _ext = get_image_file(img_num)
        if img_path is None:
            skipped_no_file += 1
            continue
        # Skip small images; the size on disk is what counts, not the size
        # reported in the pdfimages listing.
        file_size = img_path.stat().st_size
        if file_size < MIN_FILE_SIZE:
            skipped_small += 1
            continue
        # Map page -> CL number
        if page not in page_to_cl:
            skipped_no_cl += 1
            continue
        cl_num = page_to_cl[page]
        # Check if we have a song for this CL
        if cl_num not in cl_to_song:
            skipped_no_song += 1
            continue
        cl_to_images[cl_num].append((img_num, img_path, file_size))
    print(f" Skipped {skipped_smask} smask images")
    print(f" Skipped {skipped_small} images < {MIN_FILE_SIZE // 1024}KB")
    print(f" Skipped {skipped_no_file} images with no extracted file")
    print(f" Skipped {skipped_no_cl} images on pages without CL number")
    print(f" Skipped {skipped_no_song} images whose CL has no song file")
    print(f" Found images for {len(cl_to_images)} songs")

    # Step 5: Insert images into songs
    print("\n[5] Inserting \\songimage commands...")
    inserted = 0
    skipped_existing = 0
    failed_inserts = 0
    songs_with_multiple = 0
    for cl_num in sorted(cl_to_images):
        candidates = cl_to_images[cl_num]
        song_path = cl_to_song[cl_num]
        # Check if song already has an image
        if song_already_has_image(song_path.read_text(encoding="utf-8")):
            skipped_existing += 1
            continue
        if len(candidates) > 1:
            songs_with_multiple += 1
        # Pick the largest image by file size; on a tie max() keeps the
        # earliest candidate.
        img_num, img_path, file_size = max(candidates, key=lambda c: c[2])
        # Path as it should appear inside the \songimage command
        image_rel = f"images/img-{img_num:03d}{img_path.suffix}"
        if insert_songimage(song_path, image_rel):
            size_kb = file_size / 1024
            print(f" CL {cl_num:>3d}: {song_path.name} <- {image_rel} ({size_kb:.0f}KB)")
            inserted += 1
        else:
            # The song had no image a moment ago, so this is a failed
            # insertion (insert_songimage prints the reason), not an
            # "already has an image" case.
            failed_inserts += 1

    # Summary
    print("\n" + "=" * 60)
    print("Summary")
    print("=" * 60)
    print(f" Images in PDF: {len(image_info)}")
    print(f" Songs with CL numbers: {len(cl_to_song)}")
    print(f" Songs receiving images: {inserted}")
    print(f" Songs already had images: {skipped_existing}")
    print(f" Songs skipped (insert failed): {failed_inserts}")
    print(f" Songs with multiple candidates: {songs_with_multiple} (picked largest)")
    print(f" Images not mapped to any song: {skipped_no_cl + skipped_no_song}")
# Run the pipeline only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()