#!/usr/bin/env python3
"""
Insert \\songimage{} commands into song .tex files based on the
image-to-song mapping extracted from the reference PDF.

Steps:
1. Run `pdfimages -list` to get image number -> PDF page mapping
2. Run `pdftotext -layout` to get PDF page -> CL number mapping
3. Read song .tex files to get CL number -> song filename mapping
4. For each image that maps to a song, add \\songimage{images/img-NNN.ext}

Idempotent: skips songs that already have \\songimage or \\fillerpage commands.
"""

import os
import re
import subprocess
import sys
from pathlib import Path
from collections import defaultdict

# --- Configuration ---
PROJECT_DIR = Path("/home/shahondin1624/Projects/songbook")
SONGS_DIR = PROJECT_DIR / "songs"
IMAGES_DIR = PROJECT_DIR / "images"
PDF_PATH = Path("/home/shahondin1624/Documents/Pfadfinder/Liederbuch/280824_Liederbuch_CL6_2025.pdf")
MIN_FILE_SIZE = 5 * 1024  # 5KB minimum

# Multipliers for the unit suffixes accepted by parse_size().
_SIZE_UNITS = {
    "B": 1,
    "K": 1024,
    "M": 1024 ** 2,
    "G": 1024 ** 3,
}


def _parse_pdfimages_list(listing):
    """Parse the text printed by `pdfimages -list`.

    Args:
        listing: The complete stdout of `pdfimages -list` as one string.

    Returns:
        A dict {image_num: (page, type, size_str)}. Header lines and rows
        with fewer than 14 whitespace-separated fields are ignored.
    """
    images = {}
    for line in listing.strip().split("\n"):
        # Skip header lines
        if line.startswith(("page", "---")):
            continue
        parts = line.split()
        if len(parts) < 14:
            continue
        page = int(parts[0])
        img_num = int(parts[1])
        img_type = parts[2]  # "image", "smask"
        # The size (e.g. "333K", "9107B") is the second-to-last column,
        # followed only by the ratio. It is addressed from the right
        # because rows for inline images can carry one token fewer in the
        # object/ID columns, which shifts every left-based index.
        size_str = parts[-2]
        images[img_num] = (page, img_type, size_str)
    return images


def get_image_page_mapping():
    """Run pdfimages -list and return {image_num: (page, type, size_str)}.

    Raises:
        subprocess.CalledProcessError: If pdfimages exits with a non-zero
            status (the command is run with check=True).
    """
    result = subprocess.run(
        ["pdfimages", "-list", str(PDF_PATH)],
        capture_output=True,
        text=True,
        check=True,
    )
    return _parse_pdfimages_list(result.stdout)


def parse_size(size_str):
    """Parse a size string like '333K' or '9107B' into bytes.

    Recognised suffixes are B, K, M and G (powers of 1024); a string
    without a suffix is taken as a plain byte count.

    Raises:
        ValueError: If the numeric part cannot be parsed.
    """
    size_str = size_str.strip()
    multiplier = _SIZE_UNITS.get(size_str[-1:])
    if multiplier is None:
        # No known unit suffix: treat the whole string as a byte count.
        return int(float(size_str))
    return int(float(size_str[:-1]) * multiplier)


def get_page_to_cl_mapping():
    """Run pdftotext -layout and extract page -> CL number mapping.

    Pages are separated on form feeds. For each page the last two lines
    are inspected: the second-to-last must contain "CL", and the number on
    the last line whose start column is closest to that "CL" (and less
    than 8 columns away) is taken as the page's CL number.

    Returns:
        A dict {page_number: cl_number} with 1-based page numbers.

    Raises:
        subprocess.CalledProcessError: If pdftotext exits with a non-zero
            status (the command is run with check=True).
    """
    result = subprocess.run(
        ["pdftotext", "-layout", str(PDF_PATH), "-"],
        capture_output=True,
        text=True,
        check=True,
    )
    page_to_cl = {}
    for page_num, page_text in enumerate(result.stdout.split("\f"), start=1):
        lines = page_text.strip().split("\n")
        if len(lines) < 2:
            continue
        header_line, number_line = lines[-2], lines[-1]
        if "CL" not in header_line:
            continue
        cl_pos = header_line.index("CL")
        # Find the number closest to the CL column position
        best_num = None
        best_dist = float("inf")
        for m in re.finditer(r"\d+", number_line):
            dist = abs(m.start() - cl_pos)
            if dist < 8 and dist < best_dist:
                best_dist = dist
                best_num = int(m.group())
        if best_num is not None:
            page_to_cl[page_num] = best_num
    return page_to_cl


def get_cl_to_song_mapping():
    """Read song .tex files and return {cl_number: song_filepath}.

    Only the first `cl = <digits>` occurrence in each file is used. Files
    are visited in sorted order, so when two files declare the same CL
    number the later filename wins.
    """
    cl_to_song = {}
    for tex_file in sorted(SONGS_DIR.glob("*.tex")):
        content = tex_file.read_text(encoding="utf-8")
        match = re.search(r"cl\s*=\s*(\d+)", content)
        if match:
            cl_to_song[int(match.group(1))] = tex_file
    return cl_to_song


def get_image_file(img_num):
    """Find the actual image file for a given image number.

    Looks for img-NNN.jpg, then img-NNN.png, inside IMAGES_DIR.

    Returns:
        (path, ext) for the first file that exists, or (None, None) when
        neither exists.
    """
    for ext in ["jpg", "png"]:
        path = IMAGES_DIR / f"img-{img_num:03d}.{ext}"
        if path.exists():
            return path, ext
    return None, None


def song_already_has_image(content):
    """Check if the song already has a \\songimage or \\fillerpage command."""
    return r"\songimage" in content or r"\fillerpage" in content


def insert_songimage(filepath, image_path_rel):
    """Insert \\songimage{path} into a song file, directly before \\end{song}.

    The command is placed before the first \\end{song} in the file, with a
    blank line before and after it.

    Args:
        filepath: Path of the song .tex file; rewritten in place.
        image_path_rel: Path written inside the \\songimage{...} braces.

    Returns:
        True if the file was modified. False if the song already contains
        a \\songimage or \\fillerpage command, or if no \\end{song} was
        found (a warning is printed in that case).
    """
    content = filepath.read_text(encoding="utf-8")

    if song_already_has_image(content):
        return False

    # Find the position of \end{song}
    end_song_match = re.search(r"\\end\{song\}", content)
    if not end_song_match:
        print(f" WARNING: No \\end{{song}} found in {filepath.name}")
        return False

    end_song_pos = end_song_match.start()
    before = content[:end_song_pos]

    # Ensure there's a blank line before the command for readability,
    # adding only as many newlines as are still missing.
    if before.endswith("\n\n"):
        padding = ""
    elif before.endswith("\n"):
        padding = "\n"
    else:
        padding = "\n\n"

    insert_text = f"\\songimage{{{image_path_rel}}}\n\n"
    new_content = before + padding + insert_text + content[end_song_pos:]

    filepath.write_text(new_content, encoding="utf-8")
    return True


def main():
    """Map PDF images to songs and insert one \\songimage per song.

    Prints progress for each step and a summary at the end. When several
    images map to the same song, the one with the largest file size is
    used.
    """
    print("=" * 60)
    print("Image-to-Song Insertion Script")
    print("=" * 60)

    # Step 1: Get image -> page mapping from PDF
    print("\n[1] Running pdfimages -list...")
    image_info = get_image_page_mapping()
    print(f" Found {len(image_info)} images in PDF")

    # Step 2: Get page -> CL mapping from PDF text
    print("\n[2] Running pdftotext -layout...")
    page_to_cl = get_page_to_cl_mapping()
    print(f" Found CL numbers for {len(page_to_cl)} pages")

    # Step 3: Get CL -> song file mapping
    print("\n[3] Reading song .tex files...")
    cl_to_song = get_cl_to_song_mapping()
    print(f" Found {len(cl_to_song)} songs with CL numbers")

    # Step 4: Build image -> CL mapping, filtering out smasks and small images
    print("\n[4] Building image-to-song mapping...")

    skipped_smask = 0
    skipped_small = 0
    skipped_no_file = 0
    skipped_no_cl = 0
    skipped_no_song = 0

    # Collect: CL number -> list of (image_num, file_path, file_size)
    cl_to_images = defaultdict(list)

    # The size string reported by pdfimages is not used here; the size of
    # the extracted file on disk is what gets compared to MIN_FILE_SIZE.
    for img_num, (page, img_type, _size_str) in sorted(image_info.items()):
        # Skip smask (alpha channel masks)
        if img_type == "smask":
            skipped_smask += 1
            continue

        # Check if image file exists
        img_path, _ext = get_image_file(img_num)
        if img_path is None:
            skipped_no_file += 1
            continue

        # Skip small images
        file_size = img_path.stat().st_size
        if file_size < MIN_FILE_SIZE:
            skipped_small += 1
            continue

        # Map page -> CL number
        if page not in page_to_cl:
            skipped_no_cl += 1
            continue
        cl_num = page_to_cl[page]

        # Check if we have a song for this CL
        if cl_num not in cl_to_song:
            skipped_no_song += 1
            continue

        cl_to_images[cl_num].append((img_num, img_path, file_size))

    print(f" Skipped {skipped_smask} smask images")
    print(f" Skipped {skipped_small} images < {MIN_FILE_SIZE // 1024}KB")
    print(f" Skipped {skipped_no_file} images with no extracted file")
    print(f" Skipped {skipped_no_cl} images on pages without CL number")
    print(f" Skipped {skipped_no_song} images whose CL has no song file")
    print(f" Found images for {len(cl_to_images)} songs")

    # Step 5: Insert images into songs
    print("\n[5] Inserting \\songimage commands...")

    inserted = 0
    skipped_existing = 0
    skipped_no_end_song = 0
    songs_with_multiple = 0

    for cl_num in sorted(cl_to_images):
        candidates = cl_to_images[cl_num]
        song_path = cl_to_song[cl_num]

        # Check if song already has an image
        content = song_path.read_text(encoding="utf-8")
        if song_already_has_image(content):
            skipped_existing += 1
            continue

        # Pick the largest image by file size (first one wins on ties)
        if len(candidates) > 1:
            songs_with_multiple += 1
        img_num, img_path, file_size = max(candidates, key=lambda c: c[2])

        # Determine the relative path for the \songimage command
        ext = img_path.suffix
        image_rel = f"images/img-{img_num:03d}{ext}"

        # Insert into the song file
        if insert_songimage(song_path, image_rel):
            size_kb = file_size / 1024
            print(f" CL {cl_num:>3d}: {song_path.name} <- {image_rel} ({size_kb:.0f}KB)")
            inserted += 1
        else:
            # Existing images were ruled out above, so a refusal here means
            # the file has no \end{song}; count it separately instead of
            # reporting it as "already had images".
            skipped_no_end_song += 1

    # Summary
    print("\n" + "=" * 60)
    print("Summary")
    print("=" * 60)
    print(f" Images in PDF: {len(image_info)}")
    print(f" Songs with CL numbers: {len(cl_to_song)}")
    print(f" Songs receiving images: {inserted}")
    print(f" Songs already had images: {skipped_existing}")
    print(f" Songs without \\end{{song}}: {skipped_no_end_song}")
    print(f" Songs with multiple candidates: {songs_with_multiple} (picked largest)")
    print(f" Images not mapped to any song: {skipped_no_cl + skipped_no_song}")


if __name__ == "__main__":
    main()