Extract images from the CL6 PDF using pdfimages, map them to songs via page-to-CL number matching, and insert \songimage commands. Add insert-images.py script for repeatable extraction. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
278 lines
9.0 KiB
Python
278 lines
9.0 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Insert \\songimage{} commands into song .tex files based on the image-to-song
|
|
mapping extracted from the reference PDF.
|
|
|
|
Steps:
|
|
1. Run `pdfimages -list` to get image number -> PDF page mapping
|
|
2. Run `pdftotext -layout` to get PDF page -> CL number mapping
|
|
3. Read song .tex files to get CL number -> song filename mapping
|
|
4. For each image that maps to a song, add \\songimage{images/img-NNN.ext}
|
|
|
|
Idempotent: skips songs that already have \\songimage or \\fillerpage commands.
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
from pathlib import Path
|
|
from collections import defaultdict
|
|
|
|
# --- Configuration ---
# The defaults below are the original, machine-specific locations. Set the
# environment variables SONGBOOK_PROJECT_DIR and/or SONGBOOK_PDF_PATH to run
# the script against a checkout or reference PDF stored somewhere else.
PROJECT_DIR = Path(
    os.environ.get("SONGBOOK_PROJECT_DIR", "/home/shahondin1624/Projects/songbook")
)
# Directory scanned for song .tex files.
SONGS_DIR = PROJECT_DIR / "songs"
# Directory that is searched for the extracted img-NNN.jpg / img-NNN.png files.
IMAGES_DIR = PROJECT_DIR / "images"
# Reference PDF handed to pdfimages and pdftotext.
PDF_PATH = Path(
    os.environ.get(
        "SONGBOOK_PDF_PATH",
        "/home/shahondin1624/Documents/Pfadfinder/Liederbuch/280824_Liederbuch_CL6_2025.pdf",
    )
)
# Extracted image files smaller than this many bytes are ignored.
MIN_FILE_SIZE = 5 * 1024  # 5KB minimum
|
|
|
|
|
|
def get_image_page_mapping(listing=None):
    """Return {image_num: (page, type, size_str)} parsed from `pdfimages -list`.

    Args:
        listing: Optional, already captured text output of `pdfimages -list`.
            When omitted, pdfimages is run on PDF_PATH and its stdout is used.

    Returns:
        A dict keyed by the image number (second column). Each value is a
        tuple of the page number (first column), the image type (third
        column, e.g. "image" or "smask") and the size string (e.g. "333K").

    Raises:
        subprocess.CalledProcessError: if pdfimages exits with an error.
        ValueError: if a data row does not start with two integers.
    """
    if listing is None:
        result = subprocess.run(
            ["pdfimages", "-list", str(PDF_PATH)],
            capture_output=True, text=True, check=True
        )
        listing = result.stdout

    images = {}
    for line in listing.strip().split("\n"):
        # Skip the column-title line and the dashed separator below it.
        if line.startswith(("page", "---")):
            continue
        parts = line.split()
        # Rows with too few columns are not image rows.
        if len(parts) < 14:
            continue
        page = int(parts[0])
        img_num = int(parts[1])
        img_type = parts[2]  # "image", "smask"
        # The size column is followed only by the ratio column, so count from
        # the end: index 12 is x-ppi on regular rows, and rows for inline
        # images have one column less than the others.
        size_str = parts[-2]  # e.g. "333K", "9107B"
        images[img_num] = (page, img_type, size_str)
    return images
|
|
|
|
|
|
def parse_size(size_str):
    """Parse a size string like '333K' or '9107B' into a number of bytes.

    Surrounding whitespace is ignored. A trailing B, K, M or G selects the
    unit (powers of 1024); a string without one of these suffixes is taken
    as a plain byte count. Fractional values are truncated towards zero.

    Raises:
        ValueError: if the numeric part cannot be parsed (including an empty
            string).
    """
    size_str = size_str.strip()
    multipliers = {"B": 1, "K": 1024, "M": 1024 ** 2, "G": 1024 ** 3}
    # Slicing (instead of indexing) keeps an empty string from raising
    # IndexError; it falls through to float("") and raises ValueError.
    suffix = size_str[-1:]
    if suffix in multipliers:
        return int(float(size_str[:-1]) * multipliers[suffix])
    return int(float(size_str))
|
|
|
|
|
|
def get_page_to_cl_mapping(text=None):
    """Return {pdf_page: cl_number} extracted from `pdftotext -layout` output.

    Pages are separated by form feeds and numbered from 1. On each page the
    last two lines are inspected: the second-to-last line must contain the
    column title "CL", and the number on the last line whose start column is
    closest to that title (and less than 8 columns away) becomes the page's
    CL number. Pages that do not match this layout are left out.

    Args:
        text: Optional, already captured output of `pdftotext -layout`.
            When omitted, pdftotext is run on PDF_PATH and its stdout is used.

    Raises:
        subprocess.CalledProcessError: if pdftotext exits with an error.
    """
    if text is None:
        result = subprocess.run(
            ["pdftotext", "-layout", str(PDF_PATH), "-"],
            capture_output=True, text=True, check=True
        )
        text = result.stdout

    page_to_cl = {}
    for page_no, page_text in enumerate(text.split("\f"), start=1):
        # Only trim the end of the page. A full strip() would also remove the
        # indentation of the first line; when the page consists of just the
        # two footer lines that first line is the header, and its "CL" column
        # would no longer line up with the numbers beneath it.
        lines = page_text.rstrip().split("\n")
        if len(lines) < 2:
            continue

        header_line, number_line = lines[-2], lines[-1]
        cl_pos = header_line.find("CL")
        if cl_pos < 0:
            continue

        # Numbers that start fewer than 8 columns away from the "CL" title.
        nearby = [
            m for m in re.finditer(r"\d+", number_line)
            if abs(m.start() - cl_pos) < 8
        ]
        if nearby:
            # min() keeps the first of several equally distant numbers.
            closest = min(nearby, key=lambda m: abs(m.start() - cl_pos))
            page_to_cl[page_no] = int(closest.group())

    return page_to_cl
|
|
|
|
|
|
def get_cl_to_song_mapping():
    """Map CL numbers to the song files in SONGS_DIR that declare them.

    Every ``*.tex`` file directly inside SONGS_DIR is read as UTF-8, in
    sorted path order, and searched for the first ``cl = <digits>``
    occurrence. Files without such an occurrence are ignored.

    Returns:
        A dict {cl_number: path_of_tex_file}.
    """
    # NOTE(review): the pattern has no word boundary, so any text ending in
    # "cl" followed by "=<digits>" matches as well - confirm against the
    # song metadata format.
    cl_pattern = re.compile(r"cl\s*=\s*(\d+)")

    mapping = {}
    for song_file in sorted(SONGS_DIR.glob("*.tex")):
        found = cl_pattern.search(song_file.read_text(encoding="utf-8"))
        if found is None:
            continue
        # NOTE(review): when two files declare the same CL number, the file
        # that sorts later replaces the earlier one without any message.
        mapping[int(found.group(1))] = song_file
    return mapping
|
|
|
|
|
|
def get_image_file(img_num):
    """Locate the extracted image file that belongs to an image number.

    Looks in IMAGES_DIR for ``img-NNN.jpg`` first and ``img-NNN.png`` second,
    where NNN is the image number zero-padded to at least three digits.

    Returns:
        ``(path, ext)`` for the first file that exists, with ``ext`` given
        without a leading dot, or ``(None, None)`` when neither file exists.
    """
    stem = f"img-{img_num:03d}"
    candidates = (
        (IMAGES_DIR / f"{stem}.{suffix}", suffix) for suffix in ("jpg", "png")
    )
    existing = (pair for pair in candidates if pair[0].exists())
    return next(existing, (None, None))
|
|
|
|
|
|
def song_already_has_image(content):
    """Return True when the song text contains \\songimage or \\fillerpage.

    This is a plain substring test on the file content, so the commands are
    also found inside comments or as a prefix of a longer command name.
    """
    commands = (r"\songimage", r"\fillerpage")
    return any(command in content for command in commands)
|
|
|
|
|
|
def insert_songimage(filepath, image_path_rel):
    """Write a \\songimage{path} line into a song file, ahead of \\end{song}.

    The command is placed directly before the first ``\\end{song}`` in the
    file, separated by a blank line from the text before it and from
    ``\\end{song}`` after it. The file is read and rewritten as UTF-8.

    Args:
        filepath: Path object of the song .tex file.
        image_path_rel: Image path to put between the braces.

    Returns:
        True when the file was rewritten. False when the file already
        contains an image or filler page command, or when it has no
        ``\\end{song}`` (a warning is printed in that case).
    """
    text = filepath.read_text(encoding="utf-8")

    # Leave files alone that already carry an image or filler page.
    if song_already_has_image(text):
        return False

    marker = re.search(r"\\end\{song\}", text)
    if marker is None:
        print(f" WARNING: No \\end{{song}} found in {filepath.name}")
        return False

    split_at = marker.start()
    head, tail = text[:split_at], text[split_at:]

    # Add just enough newlines so that a blank line precedes the command.
    if head.endswith("\n\n"):
        padding = ""
    elif head.endswith("\n"):
        padding = "\n"
    else:
        padding = "\n\n"

    command = f"\\songimage{{{image_path_rel}}}\n\n"
    filepath.write_text(head + padding + command + tail, encoding="utf-8")
    return True
|
|
|
|
|
|
def main():
    """Run the whole image-to-song workflow and print a progress report.

    Steps:
      1. Collect image number -> page information from the PDF.
      2. Collect page -> CL number information from the PDF text.
      3. Collect CL number -> song file information from the song files.
      4. Group the usable images by CL number. Soft masks, images without an
         extracted file, files below MIN_FILE_SIZE, pages without a CL number
         and CL numbers without a song file are skipped and counted.
      5. For every song that has no image yet, insert the largest candidate
         image (by file size) and report the result.

    Songs for which the insertion itself fails are counted separately from
    songs that already had an image.
    """
    print("=" * 60)
    print("Image-to-Song Insertion Script")
    print("=" * 60)

    # Step 1: Get image -> page mapping from PDF
    print("\n[1] Running pdfimages -list...")
    image_info = get_image_page_mapping()
    print(f" Found {len(image_info)} images in PDF")

    # Step 2: Get page -> CL mapping from PDF text
    print("\n[2] Running pdftotext -layout...")
    page_to_cl = get_page_to_cl_mapping()
    print(f" Found CL numbers for {len(page_to_cl)} pages")

    # Step 3: Get CL -> song file mapping
    print("\n[3] Reading song .tex files...")
    cl_to_song = get_cl_to_song_mapping()
    print(f" Found {len(cl_to_song)} songs with CL numbers")

    # Step 4: Build image -> CL mapping, filtering out smasks and small images
    print("\n[4] Building image-to-song mapping...")
    skipped_smask = 0
    skipped_small = 0
    skipped_no_file = 0
    skipped_no_cl = 0
    skipped_no_song = 0

    # Collect: CL number -> list of (image_num, file_path, file_size)
    cl_to_images = defaultdict(list)

    for img_num, (page, img_type, _size_str) in sorted(image_info.items()):
        # Skip smask (alpha channel masks)
        if img_type == "smask":
            skipped_smask += 1
            continue

        # Check if image file exists
        img_path, _ = get_image_file(img_num)
        if img_path is None:
            skipped_no_file += 1
            continue

        # Skip small images; the size of the extracted file on disk decides,
        # not the size string reported by pdfimages.
        file_size = img_path.stat().st_size
        if file_size < MIN_FILE_SIZE:
            skipped_small += 1
            continue

        # Map page -> CL number
        if page not in page_to_cl:
            skipped_no_cl += 1
            continue
        cl_num = page_to_cl[page]

        # Check if we have a song for this CL
        if cl_num not in cl_to_song:
            skipped_no_song += 1
            continue

        cl_to_images[cl_num].append((img_num, img_path, file_size))

    print(f" Skipped {skipped_smask} smask images")
    print(f" Skipped {skipped_small} images < {MIN_FILE_SIZE // 1024}KB")
    print(f" Skipped {skipped_no_file} images with no extracted file")
    print(f" Skipped {skipped_no_cl} images on pages without CL number")
    print(f" Skipped {skipped_no_song} images whose CL has no song file")
    print(f" Found images for {len(cl_to_images)} songs")

    # Step 5: Insert images into songs
    print("\n[5] Inserting \\songimage commands...")
    inserted = 0
    skipped_existing = 0
    failed_insert = 0
    songs_with_multiple = 0

    for cl_num in sorted(cl_to_images):
        candidates = cl_to_images[cl_num]
        song_path = cl_to_song[cl_num]

        # Check if song already has an image
        content = song_path.read_text(encoding="utf-8")
        if song_already_has_image(content):
            skipped_existing += 1
            continue

        # Pick the largest image by file size; for equal sizes max() keeps
        # the candidate with the lowest image number (candidates were added
        # in ascending image-number order).
        if len(candidates) > 1:
            songs_with_multiple += 1
        img_num, img_path, file_size = max(candidates, key=lambda c: c[2])

        # Determine the relative path for the \songimage command
        ext = img_path.suffix
        image_rel = f"images/img-{img_num:03d}{ext}"

        # Insert into the song file
        if insert_songimage(song_path, image_rel):
            size_kb = file_size / 1024
            print(f" CL {cl_num:>3d}: {song_path.name} <- {image_rel} ({size_kb:.0f}KB)")
            inserted += 1
        else:
            # The check above found no existing image, so a refusal here
            # means the insertion failed (insert_songimage prints the
            # reason). Do not report it as "already had images".
            failed_insert += 1

    # Summary
    print("\n" + "=" * 60)
    print("Summary")
    print("=" * 60)
    print(f" Images in PDF: {len(image_info)}")
    print(f" Songs with CL numbers: {len(cl_to_song)}")
    print(f" Songs receiving images: {inserted}")
    print(f" Songs already had images: {skipped_existing}")
    print(f" Songs that could not be updated: {failed_insert}")
    print(f" Songs with multiple candidates: {songs_with_multiple} (picked largest)")
    print(f" Images not mapped to any song: {skipped_no_cl + skipped_no_song}")
|
|
|
|
|
|
# Run the workflow only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|