Extract and insert 97 images from reference PDF into songs
Extract images from the CL6 PDF using pdfimages, map them to songs via page-to-CL number matching, and insert \songimage commands. Add insert-images.py script for repeatable extraction. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
277
insert-images.py
Normal file
277
insert-images.py
Normal file
@@ -0,0 +1,277 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Insert \\songimage{} commands into song .tex files based on the image-to-song
|
||||
mapping extracted from the reference PDF.
|
||||
|
||||
Steps:
|
||||
1. Run `pdfimages -list` to get image number -> PDF page mapping
|
||||
2. Run `pdftotext -layout` to get PDF page -> CL number mapping
|
||||
3. Read song .tex files to get CL number -> song filename mapping
|
||||
4. For each image that maps to a song, add \\songimage{images/img-NNN.ext}
|
||||
|
||||
Idempotent: skips songs that already have \\songimage or \\fillerpage commands.
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
|
||||
# --- Configuration ---
# The defaults below are the original author's machine-specific locations.
# Each one can be overridden through an environment variable so the script
# also works from other checkouts without editing the source.
PROJECT_DIR = Path(
    os.environ.get("SONGBOOK_PROJECT_DIR", "/home/shahondin1624/Projects/songbook")
)
# Directory globbed for song sources (*.tex).
SONGS_DIR = PROJECT_DIR / "songs"
# Directory searched for extracted images named img-NNN.jpg / img-NNN.png.
IMAGES_DIR = PROJECT_DIR / "images"
# Reference PDF passed to pdfimages and pdftotext.
PDF_PATH = Path(
    os.environ.get(
        "SONGBOOK_PDF_PATH",
        "/home/shahondin1624/Documents/Pfadfinder/Liederbuch/280824_Liederbuch_CL6_2025.pdf",
    )
)
# Extracted image files smaller than this many bytes are ignored.
MIN_FILE_SIZE = 5 * 1024 # 5KB minimum
|
||||
|
||||
|
||||
def get_image_page_mapping():
    """Run ``pdfimages -list`` and return ``{image_num: (page, type, size_str)}``.

    Returns:
        dict mapping the image number (second column of the listing) to a
        tuple of the PDF page number, the image type string (for example
        "image" or "smask") and the size column as printed (for example
        "333K" or "9107B").

    Raises:
        subprocess.CalledProcessError: if pdfimages exits with a non-zero
            status (``check=True``).
        ValueError: if a data row has a non-numeric page or image number.
    """
    result = subprocess.run(
        ["pdfimages", "-list", str(PDF_PATH)],
        capture_output=True, text=True, check=True,
    )

    images = {}
    for line in result.stdout.strip().split("\n"):
        # Skip the two header lines (column titles and the dashed rule).
        if line.startswith(("page", "---")):
            continue

        parts = line.split()
        if len(parts) < 14:
            continue

        page = int(parts[0])
        img_num = int(parts[1])
        img_type = parts[2]  # "image", "smask"
        # The listing ends with "... x-ppi y-ppi size ratio", so the size is
        # the second-to-last field.  A fixed index of 12 lands on x-ppi, and
        # counting from the end also stays correct for rows where the
        # "object ID" pair is printed as a single token.
        size_str = parts[-2]  # e.g. "333K", "9107B"
        images[img_num] = (page, img_type, size_str)

    return images
|
||||
|
||||
|
||||
def parse_size(size_str):
    """Parse a size string like '333K' or '9107B' into bytes.

    Recognised suffixes are B (bytes), K, M and G (powers of 1024).  A string
    without a suffix is taken as a plain byte count.  Fractional values are
    truncated towards zero.

    Args:
        size_str: size text, optionally surrounded by whitespace.

    Returns:
        The size in bytes as an int.

    Raises:
        ValueError: if the numeric part cannot be parsed (including an empty
            string).
    """
    size_str = size_str.strip()
    multipliers = {"B": 1, "K": 1024, "M": 1024 ** 2, "G": 1024 ** 3}

    # Slicing (rather than indexing) keeps an empty string from raising
    # IndexError; it falls through to float("") and raises ValueError instead.
    unit = size_str[-1:]
    if unit in multipliers:
        return int(float(size_str[:-1]) * multipliers[unit])
    return int(float(size_str))
|
||||
|
||||
|
||||
def get_page_to_cl_mapping():
    """Run ``pdftotext -layout`` and extract a page -> CL number mapping.

    For every page the last two non-blank-trailing lines are treated as a
    footer: the second-to-last line must contain the column title "CL", and
    the last line holds the numbers.  The number whose start column is
    closest to the "CL" column (and less than 8 characters away) is taken as
    that page's CL number.

    Returns:
        dict mapping the 1-based PDF page number to its CL number.  Pages
        without a matching footer are omitted.

    Raises:
        subprocess.CalledProcessError: if pdftotext exits with a non-zero
            status (``check=True``).
    """
    result = subprocess.run(
        ["pdftotext", "-layout", str(PDF_PATH), "-"],
        capture_output=True, text=True, check=True,
    )

    page_to_cl = {}
    # Pages are split on form feeds and numbered from 1.
    for page_num, page_text in enumerate(result.stdout.split("\f"), start=1):
        # Trim only trailing whitespace.  Stripping the front as well would
        # remove the leading spaces of the first line; on a page that holds
        # nothing but the two footer lines this shifts the "CL" column and
        # the positional match below fails.
        lines = page_text.rstrip().split("\n")
        if len(lines) < 2:
            continue

        header_line, number_line = lines[-2], lines[-1]
        cl_pos = header_line.find("CL")
        if cl_pos < 0:
            continue

        # Find the number closest to the CL column position; on a tie the
        # leftmost number wins (min() keeps the first minimum).
        nearby = [
            (abs(m.start() - cl_pos), int(m.group()))
            for m in re.finditer(r"\d+", number_line)
            if abs(m.start() - cl_pos) < 8
        ]
        if nearby:
            page_to_cl[page_num] = min(nearby, key=lambda pair: pair[0])[1]

    return page_to_cl
|
||||
|
||||
|
||||
def get_cl_to_song_mapping():
    """Scan the song sources and return ``{cl_number: song_filepath}``.

    Every ``*.tex`` file in SONGS_DIR is read (in sorted filename order) and
    searched for the first ``cl = <digits>`` occurrence.  Files without one
    are left out.  If two files carry the same number, the one that sorts
    later replaces the earlier entry.
    """
    cl_pattern = re.compile(r"cl\s*=\s*(\d+)")
    mapping = {}
    for song_file in sorted(SONGS_DIR.glob("*.tex")):
        found = cl_pattern.search(song_file.read_text(encoding="utf-8"))
        if found is None:
            continue
        mapping[int(found.group(1))] = song_file
    return mapping
|
||||
|
||||
|
||||
def get_image_file(img_num):
    """Locate the extracted image file for image number *img_num*.

    Looks in IMAGES_DIR for ``img-NNN.jpg`` first, then ``img-NNN.png``
    (NNN is the number zero-padded to at least three digits).

    Returns:
        ``(path, ext)`` for the first file that exists, where ``ext`` has no
        leading dot, or ``(None, None)`` if neither file exists.
    """
    candidates = (
        (IMAGES_DIR / f"img-{img_num:03d}.{suffix}", suffix)
        for suffix in ("jpg", "png")
    )
    return next(
        ((path, suffix) for path, suffix in candidates if path.exists()),
        (None, None),
    )
|
||||
|
||||
|
||||
def song_already_has_image(content):
    """Return True if *content* contains a \\songimage or \\fillerpage command.

    This is a plain substring test on the song source text.
    """
    markers = (r"\songimage", r"\fillerpage")
    return any(marker in content for marker in markers)
|
||||
|
||||
|
||||
def insert_songimage(filepath, image_path_rel):
    """Write a \\songimage{path} command into a song file, just before \\end{song}.

    The command is placed in front of the first ``\\end{song}`` in the file,
    separated from the preceding text and from ``\\end{song}`` by a blank
    line each.

    Args:
        filepath: path object of the song ``.tex`` file (read and rewritten
            as UTF-8).
        image_path_rel: path text placed inside the ``\\songimage{...}`` braces.

    Returns:
        True if the file was modified; False if it already has an image or
        filler page command, or if no ``\\end{song}`` was found (a warning is
        printed in that case).
    """
    text = filepath.read_text(encoding="utf-8")

    if song_already_has_image(text):
        return False

    marker_pos = text.find("\\end{song}")
    if marker_pos == -1:
        print(f" WARNING: No \\end{{song}} found in {filepath.name}")
        return False

    head, tail = text[:marker_pos], text[marker_pos:]

    # Top up the newlines in front of the command so that exactly one blank
    # line separates it from whatever precedes it.
    if head.endswith("\n\n"):
        padding = ""
    elif head.endswith("\n"):
        padding = "\n"
    else:
        padding = "\n\n"

    command = f"\\songimage{{{image_path_rel}}}\n\n"
    filepath.write_text(head + padding + command + tail, encoding="utf-8")
    return True
|
||||
|
||||
|
||||
def main():
    """Run the whole pipeline and print a progress report.

    Steps:
      1. image number -> PDF page (``get_image_page_mapping``)
      2. PDF page -> CL number (``get_page_to_cl_mapping``)
      3. CL number -> song file (``get_cl_to_song_mapping``)
      4. group usable images per CL number, skipping smasks, images without
         an extracted file, files below MIN_FILE_SIZE, pages without a CL
         number and CL numbers without a song
      5. for each song without an image yet, insert the largest candidate

    Songs whose insertion fails (``insert_songimage`` returns False after the
    "already has image" check passed) are reported separately instead of
    being counted as "already had images".
    """
    print("=" * 60)
    print("Image-to-Song Insertion Script")
    print("=" * 60)

    # Step 1: Get image -> page mapping from PDF
    print("\n[1] Running pdfimages -list...")
    image_info = get_image_page_mapping()
    print(f" Found {len(image_info)} images in PDF")

    # Step 2: Get page -> CL mapping from PDF text
    print("\n[2] Running pdftotext -layout...")
    page_to_cl = get_page_to_cl_mapping()
    print(f" Found CL numbers for {len(page_to_cl)} pages")

    # Step 3: Get CL -> song file mapping
    print("\n[3] Reading song .tex files...")
    cl_to_song = get_cl_to_song_mapping()
    print(f" Found {len(cl_to_song)} songs with CL numbers")

    # Step 4: Build image -> CL mapping, filtering out smasks and small images
    print("\n[4] Building image-to-song mapping...")
    skipped_smask = 0
    skipped_small = 0
    skipped_no_file = 0
    skipped_no_cl = 0
    skipped_no_song = 0

    # Collect: CL number -> list of (image_num, file_path, file_size)
    cl_to_images = defaultdict(list)

    # The size column from the listing is not used here; the size of the
    # extracted file on disk is what gets compared.
    for img_num, (page, img_type, _size_str) in sorted(image_info.items()):
        # Skip smask (alpha channel masks)
        if img_type == "smask":
            skipped_smask += 1
            continue

        # Check if image file exists
        img_path, _ext = get_image_file(img_num)
        if img_path is None:
            skipped_no_file += 1
            continue

        # Skip small images
        file_size = img_path.stat().st_size
        if file_size < MIN_FILE_SIZE:
            skipped_small += 1
            continue

        # Map page -> CL number
        if page not in page_to_cl:
            skipped_no_cl += 1
            continue
        cl_num = page_to_cl[page]

        # Check if we have a song for this CL
        if cl_num not in cl_to_song:
            skipped_no_song += 1
            continue

        cl_to_images[cl_num].append((img_num, img_path, file_size))

    print(f" Skipped {skipped_smask} smask images")
    print(f" Skipped {skipped_small} images < {MIN_FILE_SIZE // 1024}KB")
    print(f" Skipped {skipped_no_file} images with no extracted file")
    print(f" Skipped {skipped_no_cl} images on pages without CL number")
    print(f" Skipped {skipped_no_song} images whose CL has no song file")
    print(f" Found images for {len(cl_to_images)} songs")

    # Step 5: Insert images into songs
    print("\n[5] Inserting \\songimage commands...")
    inserted = 0
    skipped_existing = 0
    failed_insert = 0
    songs_with_multiple = 0

    for cl_num in sorted(cl_to_images):
        candidates = cl_to_images[cl_num]
        song_path = cl_to_song[cl_num]

        # Check if song already has an image
        content = song_path.read_text(encoding="utf-8")
        if song_already_has_image(content):
            skipped_existing += 1
            continue

        # Pick the largest image by file size; on a tie the candidate with
        # the lowest image number wins (max() keeps the first maximum and
        # candidates were appended in ascending image-number order).
        if len(candidates) > 1:
            songs_with_multiple += 1
        _img_num, img_path, file_size = max(candidates, key=lambda c: c[2])

        # Relative path for the \songimage command; the file name is the
        # img-NNN.ext name that get_image_file located.
        image_rel = f"images/{img_path.name}"

        # Insert into the song file
        if insert_songimage(song_path, image_rel):
            size_kb = file_size / 1024
            print(f" CL {cl_num:>3d}: {song_path.name} <- {image_rel} ({size_kb:.0f}KB)")
            inserted += 1
        else:
            # The "already has image" case was handled above, so a False
            # here means the insertion itself did not happen.
            failed_insert += 1

    # Summary
    print("\n" + "=" * 60)
    print("Summary")
    print("=" * 60)
    print(f" Images in PDF: {len(image_info)}")
    print(f" Songs with CL numbers: {len(cl_to_song)}")
    print(f" Songs receiving images: {inserted}")
    print(f" Songs already had images: {skipped_existing}")
    print(f" Songs where insertion failed: {failed_insert}")
    print(f" Songs with multiple candidates: {songs_with_multiple} (picked largest)")
    print(f" Images not mapped to any song: {skipped_no_cl + skipped_no_song}")
|
||||
|
||||
|
||||
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user