Files
songbook/import-songs.py
shahondin1624 7b99778f67 Fix chord alignment: snap to word boundaries
Improve merge_chord_lyric() to snap chord positions to the start
of the word they fall within, instead of splitting words mid-way.
Fixes artifacts like "Liebespaar \chord{C}e" → "\chord{C}Liebespaare".

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-02 16:36:06 +02:00

1067 lines
36 KiB
Python

#!/usr/bin/env python3
"""
Parse extracted text from Carmina Leonis songbook PDF and generate .tex song files
in leadsheets format.
Usage:
python3 import-songs.py [--input /tmp/songbook-full.txt] [--output-dir songs/]
"""
import re
import os
import sys
import argparse
from dataclasses import dataclass, field
from typing import Optional
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Default path of the extracted songbook text; used as the --input default.
INPUT_FILE = "/tmp/songbook-full.txt"
# Default output locations, both resolved relative to this script's directory.
OUTPUT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "songs")
ALL_SONGS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "all-songs.tex")
# Existing hand-crafted songs that should NOT be overwritten.
# main() skips writing a generated file whose name is in this set, but still
# lists the song in the generated all-songs file.
EXISTING_SONGS = {
    "abend-wird-es-wieder.tex",
    "auf-auf-zum-froehlichen-jagen.tex",
    "die-gedanken-sind-frei.tex",
    "hejo-spann-den-wagen-an.tex",
    "kein-schoener-land.tex",
}
# Single chord pattern: a root letter A-H in either case (German uses H for
# B natural), optionally followed by '#' or 'b', then zero or more quality
# suffixes (m/maj/min, dim, aug, sus, add, numbers, °, ø, +) and/or a
# slash-bass part such as "/G".
# This matches ONE chord; callers apply it repeatedly to split concatenated tokens.
# NOTE(review): lower-case roots presumably denote minor chords - confirm.
SINGLE_CHORD_RE = re.compile(
    r'[A-Ha-h][#b]?'
    r'(?:m(?:aj|in)?|dim|aug|sus[24]?|add\d{1,2}|7|6|9|11|13|°|ø|\+|/[A-Ha-h][#b]?)*'
)
# BuLiBu footer header line pattern (song pages have BuLiBuII on the same line).
# Matched case-insensitively with re.search, so it may occur anywhere in the line.
BULIBU_HEADER_RE = re.compile(r'BuLiBu\s+BuLiBuII\s+CL\s+SwA', re.IGNORECASE)
# Alternate footer pattern at the very end of the book (no CL column between
# BuLiBuII and SwA).
BULIBU_HEADER_ALT_RE = re.compile(r'BuLiBu\s+BuLiBuII\s+SwA', re.IGNORECASE)
# Metadata patterns: "Worte und Weise:", "Worte:" and "Weise:" at the start of
# a line (leading whitespace allowed); group 1 captures the text after the colon.
WORTE_UND_WEISE_RE = re.compile(r'^\s*Worte\s+und\s+Weise\s*:\s*(.+)', re.IGNORECASE)
WORTE_RE = re.compile(r'^\s*Worte\s*:\s*(.+)', re.IGNORECASE)
WEISE_RE = re.compile(r'^\s*Weise\s*:\s*(.+)', re.IGNORECASE)
# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------
@dataclass
class SongReferences:
    """Reference numbers of one song, one optional string per footer column.

    Every field defaults to None, meaning no number was found for that column.
    """
    bulibu: Optional[str] = None  # number from the "BuLiBu" column
    bulibull: Optional[str] = None  # number from the "BuLiBuII" column (attribute is spelled with two l's)
    cl: Optional[str] = None  # number from the "CL" column
    swa: Optional[str] = None  # number from the "SwA" column
    barde: Optional[str] = None  # number from the "Barde" column
    libock: Optional[str] = None  # number from the "LiBock" column
@dataclass
class Song:
    """One parsed song: title, optional authorship metadata, notes, raw content and references."""
    title: str = ""
    lyrics_author: Optional[str] = None  # Worte (text captured from a "Worte:" / "Worte und Weise:" line)
    composer: Optional[str] = None  # Weise (text captured from a "Weise:" / "Worte und Weise:" line)
    notes: list = field(default_factory=list)  # Commentary paragraphs, emitted as '%' comment lines in the .tex output
    raw_lines: list = field(default_factory=list)  # All content lines (chord + lyric), not yet merged
    refs: SongReferences = field(default_factory=SongReferences)  # Songbook reference numbers
# ---------------------------------------------------------------------------
# Chord detection helpers
# ---------------------------------------------------------------------------
def can_split_into_chords(token: str) -> bool:
    """Tell whether *token* is one chord or several chords written back to back.

    Surrounding parentheses are ignored. The token qualifies only if
    SINGLE_CHORD_RE can consume it completely, match after match, with no
    leftover characters.

    Examples: 'G' -> True, 'De' -> True (D + e), 'FCdFCd' -> True,
    'Feld' -> False, 'the' -> False
    """
    core = token.strip("()")
    if not core:
        return False
    cursor = 0
    end = len(core)
    while cursor < end:
        hit = SINGLE_CHORD_RE.match(core, cursor)
        # A failed or zero-width match means the rest cannot be a chord.
        if not hit or hit.end() <= cursor:
            return False
        cursor = hit.end()
    return True
def split_chord_token(token: str) -> list:
    """Break a (possibly concatenated) chord token into its individual chords.

    Surrounding parentheses are removed first. Splitting stops at the first
    position where SINGLE_CHORD_RE no longer matches; the chords found up to
    that point are returned.

    E.g., 'De' -> ['D', 'e'], 'FCdFCd' -> ['F', 'C', 'd', 'F', 'C', 'd']
    """
    core = token.strip("()")
    pieces = []
    cursor = 0
    while cursor < len(core):
        hit = SINGLE_CHORD_RE.match(core, cursor)
        if not hit or hit.end() <= cursor:
            break
        pieces.append(hit.group())
        cursor = hit.end()
    return pieces
def is_chord_line(line: str) -> bool:
    """Report whether every whitespace-separated token of *line* is made of chords.

    Blank lines are never chord lines. Tokens may be concatenated chords as
    produced by PDF text extraction (e.g., 'De', 'BF').
    """
    tokens = line.split()
    # An empty token list (blank line) must yield False, not all([]) == True.
    return bool(tokens) and all(can_split_into_chords(tok) for tok in tokens)
def is_bulibu_header(line: str) -> bool:
    """Report whether *line* contains a BuLiBu reference-footer header.

    Form feeds are removed before matching; either the regular or the
    alternate header pattern counts.
    """
    text = line.replace('\f', '')
    for pattern in (BULIBU_HEADER_RE, BULIBU_HEADER_ALT_RE):
        if pattern.search(text):
            return True
    return False
def is_numbers_line(line: str) -> bool:
    """Report whether *line* holds nothing but digit groups separated by whitespace.

    A blank line yields False.
    """
    tokens = line.split()
    if not tokens:
        return False
    # Tokens produced by split() contain no whitespace, so a full match on
    # digits is all that is needed.
    return all(re.fullmatch(r'\d+', tok) is not None for tok in tokens)
def parse_reference_numbers(header_line: str, numbers_line: str) -> SongReferences:
    """Parse the BuLiBu header + numbers line into SongReferences.

    The header columns are: BuLiBu BuLiBuII CL SwA Barde LiBock.
    Each column name is located in the header (left to right, exact case
    first, then case-insensitively). The text of the numbers line between a
    column's start and the next found column's start is taken as that
    column's value, and stored only if it consists solely of digits.

    Args:
        header_line: The footer header line; form feeds are ignored.
        numbers_line: The line following the header; form feeds are ignored.

    Returns:
        A SongReferences whose attributes are digit strings for the columns
        that had a number and None otherwise. Headers in the alternate
        "SwA II" layout yield an all-None result.
    """
    refs = SongReferences()
    # Strip form feed characters that come from PDF page breaks
    header_line = header_line.replace('\f', '')
    numbers_line = numbers_line.replace('\f', '')

    # Skip alternate format. A single whitespace-tolerant pattern replaces the
    # former check, which tested the identical literal twice and therefore
    # only recognised one exact spacing between "SwA" and "II".
    if re.search(r'SwA\s+II', header_line):
        return refs

    # (header column name, SongReferences attribute), in header order.
    columns = (
        ("BuLiBu", "bulibu"),
        ("BuLiBuII", "bulibull"),
        ("CL", "cl"),
        ("SwA", "swa"),
        ("Barde", "barde"),
        ("LiBock", "libock"),
    )

    lowered_header = header_line.lower()
    found = []  # (attribute, start column) for every column present in the header
    search_start = 0
    for name, attr in columns:
        idx = header_line.find(name, search_start)
        if idx == -1:
            idx = lowered_header.find(name.lower(), search_start)
        if idx >= 0:
            found.append((attr, idx))
            # Continue after this name so "BuLiBu" is not re-found inside "BuLiBuII".
            search_start = idx + len(name)

    line_end = max(len(header_line), len(numbers_line))
    for i, (attr, pos) in enumerate(found):
        end = found[i + 1][1] if i + 1 < len(found) else line_end
        # Slicing past the end of numbers_line simply yields "".
        segment = numbers_line[pos:end].strip()
        if re.match(r'^\d+$', segment):
            setattr(refs, attr, segment)
    return refs
# ---------------------------------------------------------------------------
# Filename and LaTeX helpers
# ---------------------------------------------------------------------------
def sanitize_filename(title: str) -> str:
    """Turn a song title into a filename stem (no .tex extension).

    The title is lower-cased, German umlauts and sharp s are transliterated
    (ae, oe, ue, ss), whitespace runs become single hyphens, every character
    other than a-z, 0-9 and '-' is dropped, and the result is limited to 50
    characters without leading or trailing hyphens. The result may be empty.
    """
    # Lower-casing first means only the lower-case umlauts need handling.
    transliteration = str.maketrans({"ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss"})
    slug = title.lower().translate(transliteration)
    slug = re.sub(r'\s+', '-', slug)
    slug = re.sub(r"[^a-z0-9\-]", "", slug)
    slug = re.sub(r'-+', '-', slug).strip('-')
    if len(slug) <= 50:
        return slug
    return slug[:50].rstrip('-')
def escape_latex(text: str) -> str:
    r"""Backslash-escape the LaTeX special characters & % $ # _ in *text*.

    Backslashes and braces are left alone, so a macro such as \chord{C}
    passes through unchanged unless its argument contains one of the five
    characters above (e.g. '#'); callers that need such macros intact must
    protect them before calling this function.
    """
    escapes = str.maketrans({char: "\\" + char for char in "&%$#_"})
    return text.translate(escapes)
# ---------------------------------------------------------------------------
# Chord-lyric merging
# ---------------------------------------------------------------------------
def parse_chords_with_positions(chord_line: str) -> list:
    """Locate the chords of a chord line.

    Returns a list of (column_position, chord_name) tuples in left-to-right
    order. A token is a maximal run of non-space characters; tokens that are
    not made entirely of chords are ignored. For a concatenated token the
    first chord gets the token's start column and each following chord is
    placed directly after the previous chord's text.
    """
    placed = []
    for run in re.finditer(r'[^ ]+', chord_line):
        core = run.group().strip("()")
        if not can_split_into_chords(core):
            continue
        # The column is the token start, even when the token opens with '('.
        column = run.start()
        for chord in split_chord_token(core):
            placed.append((column, chord))
            column += len(chord)
    return placed
def merge_chord_lyric(chord_line: str, lyric_line: str) -> str:
    r"""Merge a chord line and lyric line by inserting \chord{X} at positions.

    Each chord is placed at the column it occupies in *chord_line*, snapped to
    a word boundary of *lyric_line*:

    * a chord above a word (anywhere in it) moves to the start of that word;
    * a chord above a space moves forward to the next non-space character
      (or to the end of the line when only spaces follow).

    Chords whose column lies beyond the end of the lyric line are dropped.
    When several chords snap to the same position they are ALL kept, in their
    original order, separated by a single space.

    Args:
        chord_line: Line holding the chords, column-aligned with the lyric.
        lyric_line: The lyric line, with its original leading whitespace so
            that columns line up.

    Returns:
        The stripped lyric with chord macros inserted and runs of spaces
        collapsed; just the stripped lyric if no chord was recognised.
    """
    chords = parse_chords_with_positions(chord_line)
    if not chords:
        return lyric_line.strip()

    lyric_len = len(lyric_line)
    snapped = []
    for pos, chord in chords:
        if pos >= lyric_len:
            # Only keep chords within lyric range
            continue
        snap_pos = pos
        if lyric_line[snap_pos] != ' ':
            # On a word: scan backwards to the word start.
            while snap_pos > 0 and lyric_line[snap_pos - 1] != ' ':
                snap_pos -= 1
        else:
            # On a space: move forward to the next non-space.
            while snap_pos < lyric_len and lyric_line[snap_pos] == ' ':
                snap_pos += 1
        snapped.append((snap_pos, chord))

    # Insert from right to left so earlier positions stay valid. Snapped
    # positions never decrease, and inserting an earlier chord at the same
    # position puts it in front of the later one, preserving chord order.
    # No chord is skipped: the previous "pos < prev_insert_pos" guard
    # silently discarded all but the last chord of a word.
    result = lyric_line
    for pos, chord in reversed(snapped):
        result = result[:pos] + f"\\chord{{{chord}}}" + result[pos:]

    result = result.strip()
    # Collapse multiple spaces but preserve single spaces
    result = re.sub(r' +', ' ', result)
    # For adjacent chords without text between them (e.g. \chord{A}\chord{B}),
    # add a space to prevent leadsheets tabular nesting issues
    result = re.sub(r'\}(\\chord\{)', r'} \1', result)
    return result
# ---------------------------------------------------------------------------
# Song content processing
# ---------------------------------------------------------------------------
def process_song_lines(raw_lines: list) -> list:
    """Group raw song lines into verses, merging chord lines into lyrics.

    Blank lines separate verses. A chord line directly followed by a
    non-blank, non-chord line is merged with it; a chord line followed by
    another chord line, by a blank line, or by nothing is discarded. Any
    other line is kept stripped.

    Returns a list of verses, each a list of merged lyric lines.
    """
    lines = [raw.replace('\f', '') for raw in raw_lines]
    total = len(lines)
    verses = []
    verse = []
    idx = 0
    while idx < total:
        current = lines[idx]
        text = current.strip()
        if not text:
            # Blank line: close the verse in progress, if any.
            if verse:
                verses.append(verse)
                verse = []
            idx += 1
        elif not is_chord_line(text):
            verse.append(text)
            idx += 1
        else:
            follower = lines[idx + 1] if idx + 1 < total else ""
            follower_text = follower.strip()
            if follower_text and not is_chord_line(follower_text):
                # The unstripped lines are passed on so columns stay aligned.
                verse.append(merge_chord_lyric(current, follower))
                idx += 2
            else:
                # Orphan chord line (before chords, a blank, or the end): skip it.
                idx += 1
    if verse:
        verses.append(verse)
    return verses
# ---------------------------------------------------------------------------
# Song formatting
# ---------------------------------------------------------------------------
def escape_property_value(val: str) -> str:
    """Backslash-escape the characters & % $ # _ in a song property value."""
    # One pass: each special character is replaced by a backslash plus itself.
    return re.sub(r'([&%$#_])', r'\\\1', val)
def format_song_tex(song: Song) -> str:
    r"""Format a Song object as a .tex file string.

    The output consists of a \begin{song}{...} header carrying the title,
    optional lyrics/composer and any reference numbers, the notes as '%'
    comment lines, one verse environment per verse with lines joined by
    ' \\', and a closing \end{song}.

    Lyric text is LaTeX-escaped; \chord{...} macros inside a line are passed
    through untouched.
    """
    props = [f" title = {{{escape_property_value(song.title)}}},"]
    if song.lyrics_author:
        val = escape_property_value(song.lyrics_author.strip())
        props.append(f" lyrics = {{{val}}},")
    if song.composer:
        val = escape_property_value(song.composer.strip())
        props.append(f" composer = {{{val}}},")
    # Reference numbers are emitted unbraced, in footer-column order.
    for key in ("bulibu", "bulibull", "cl", "swa", "barde", "libock"):
        number = getattr(song.refs, key)
        if number:
            props.append(f" {key} = {number},")
    header = "\\begin{song}{\n" + "\n".join(props) + "\n}\n"

    # Splitting on a capturing group keeps the macros: even indices are
    # plain text (to be escaped), odd indices are chord macros (kept as is).
    # This avoids the earlier placeholder round-trip, which would corrupt a
    # lyric that happened to contain the placeholder text itself.
    chord_macro = re.compile(r'(\\chord\{[^}]*\})')
    body_parts = []
    for verse in process_song_lines(song.raw_lines):
        if not verse:
            continue
        verse_lines = []
        for line in verse:
            pieces = chord_macro.split(line)
            verse_lines.append("".join(
                piece if index % 2 else escape_latex(piece)
                for index, piece in enumerate(pieces)
            ))
        verse_text = " \\\\\n".join(verse_lines)
        body_parts.append(f"\\begin{{verse}}\n{verse_text}\n\\end{{verse}}")
    body = "\n\n".join(body_parts)

    note_section = ""
    if song.notes:
        # Notes may contain embedded newlines; every physical line gets its own '%'.
        note_lines = "\n".join(song.notes).split("\n")
        note_section = "\n" + "\n".join(f"% {note}" for note in note_lines) + "\n"
    return header + note_section + "\n" + body + "\n\n\\end{song}\n"
# ---------------------------------------------------------------------------
# Main parsing logic
# ---------------------------------------------------------------------------
def read_input(path: str) -> list:
    """Return the lines of the UTF-8 text file at *path*, line endings included."""
    with open(path, 'r', encoding='utf-8') as handle:
        return list(handle)
def find_song_start(lines: list) -> int:
    """Return the index of the first BuLiBu footer header line, or 0 if none exists.

    Song pages use 'BuLiBu BuLiBuII CL SwA Barde LiBock'.
    TOC pages use a different layout with BuLiBu appearing separately.
    """
    footer_indices = (idx for idx, text in enumerate(lines) if is_bulibu_header(text))
    return next(footer_indices, 0)
def has_metadata_before(lines: list, footer_idx: int, search_floor: int) -> bool:
    """Report whether a Worte/Weise line precedes the footer.

    Only the lines from max(search_floor, footer_idx - 40) up to, but not
    including, footer_idx are examined.
    """
    first = max(search_floor, footer_idx - 40)
    metadata_patterns = (WORTE_UND_WEISE_RE, WORTE_RE, WEISE_RE)
    return any(
        pattern.match(lines[idx].strip())
        for idx in range(first, footer_idx)
        for pattern in metadata_patterns
    )
def is_song_end_footer(lines: list, footer_idx: int, search_floor: int) -> bool:
    """Decide whether the BuLiBu footer at *footer_idx* ends a song.

    The footer counts as a song end exactly when has_metadata_before() finds
    Worte/Weise metadata ahead of it. Songs without any Worte/Weise line are
    not recognised here; the splitting logic deals with them.
    """
    metadata_found = has_metadata_before(lines, footer_idx, search_floor)
    return metadata_found
def looks_like_new_song_after(lines: list, after_start: int) -> bool:
    """Guess whether a new song begins at *after_start*.

    Up to five non-blank, non-footer lines are collected from the next twelve
    lines. A new song is assumed when the first of them could be a title
    (not a chord line, shorter than 80 characters, not purely numbers) and a
    chord line occurs among the following three collected lines. This allows
    for a "Ref.:" label between title and chords. Anything else is treated
    as a continuation of the current song.
    """
    if after_start >= len(lines):
        return False

    window_end = min(after_start + 12, len(lines))
    window = (lines[k].replace('\f', '').strip() for k in range(after_start, window_end))
    peek = [text for text in window if text and not is_bulibu_header(text)][:5]
    if len(peek) < 2:
        return False

    candidate_title = peek[0]
    if is_chord_line(candidate_title) or len(candidate_title) >= 80:
        return False
    if re.match(r'^\d+(\s+\d+)*$', candidate_title.strip()):
        return False
    return any(is_chord_line(text) for text in peek[1:4])
def split_into_raw_songs(lines: list, start_idx: int) -> list:
    """Split the lines from start_idx onward into raw song blocks.

    Strategy:
    1. Find all BuLiBu footer positions at or after start_idx. The line after
       each footer header is treated as its numbers line.
    2. Look in the (up to) 100 lines before the first footer for a short text
       line directly followed by a chord line; if found, everything from
       there up to the first footer becomes one block, with the first
       footer's reference numbers.
    3. For every later footer, the lines since the previous footer's numbers
       line are accumulated. The footer closes the current song when the
       content after it looks like a new song (looks_like_new_song_after) or
       when it is the last footer. Otherwise it is treated as a mid-song page
       break and accumulation continues.
    4. Lines remaining after the last processed footer's numbers line become
       a final block with empty references.

    Returns a list of dicts with the keys 'lines' (list of raw lines) and
    'refs' (SongReferences); an empty list if no footer is found.
    """
    footer_positions = []
    for i in range(start_idx, len(lines)):
        if is_bulibu_header(lines[i]):
            footer_positions.append(i)
    if not footer_positions:
        return []
    song_blocks = []
    current_block_start = None
    current_block_lines = []
    # Check for a song BEFORE the first footer (between TOC and first song footer).
    # The first song in the book may appear before any BuLiBu footer.
    first_footer = footer_positions[0]
    # Search forward from ~100 lines before the first footer for a title + chord pattern
    # (this window is not limited by start_idx).
    search_from = max(0, first_footer - 100)
    pre_footer_song_start = None
    for i in range(search_from, first_footer - 1):
        line = lines[i].replace('\f', '').strip()
        next_line = lines[i + 1].replace('\f', '').strip() if i + 1 < first_footer else ''
        # Look for: short text line followed by a chord line
        if (line and not is_chord_line(line) and len(line) < 80 and
                next_line and is_chord_line(next_line)):
            pre_footer_song_start = i
            break
    if pre_footer_song_start is not None:
        pre_block = lines[pre_footer_song_start:first_footer]
        refs = parse_reference_numbers(
            lines[first_footer],
            lines[first_footer + 1] if first_footer + 1 < len(lines) else ""
        )
        song_blocks.append({
            'lines': pre_block,
            'refs': refs,
        })
    for fi, footer_idx in enumerate(footer_positions):
        numbers_idx = footer_idx + 1
        if numbers_idx >= len(lines):
            # Footer header is the very last line: no numbers line to read.
            break
        # For the first footer, it's the transition from TOC to songs:
        # only remember where the next block starts.
        if fi == 0:
            current_block_start = numbers_idx + 1
            continue
        # Determine search floor (don't search before previous footer's numbers line)
        prev_footer = footer_positions[fi - 1]
        search_floor = prev_footer + 2  # Skip header + numbers line
        # Collect lines from current_block_start to this footer
        if current_block_start is not None:
            block_lines = lines[current_block_start:footer_idx]
            has_metadata = has_metadata_before(lines, footer_idx, search_floor)
            after_start = numbers_idx + 1
            new_song_follows = looks_like_new_song_after(lines, after_start)
            # Decision: is this a song boundary?
            # Effectively: boundary if a new song follows, or this is the
            # last footer. has_metadata only appears combined with the
            # last-footer test, and the special case below forces True for
            # the last footer anyway, so it does not change the outcome.
            is_boundary = new_song_follows or (has_metadata and fi == len(footer_positions) - 1)
            # Special case: if this is the last footer, it always ends the song
            if fi == len(footer_positions) - 1:
                is_boundary = True
            if is_boundary:
                current_block_lines.extend(block_lines)
                # The closing footer supplies the song's reference numbers.
                refs = parse_reference_numbers(
                    lines[footer_idx],
                    lines[numbers_idx] if numbers_idx < len(lines) else ""
                )
                song_blocks.append({
                    'lines': current_block_lines,
                    'refs': refs,
                })
                current_block_lines = []
            else:
                # Mid-song page break - accumulate
                current_block_lines.extend(block_lines)
        # The next block starts after this footer's numbers line.
        current_block_start = numbers_idx + 1
    # Handle remaining content
    if current_block_start and current_block_start < len(lines):
        remaining = lines[current_block_start:]
        current_block_lines.extend(remaining)
    if current_block_lines:
        song_blocks.append({
            'lines': current_block_lines,
            'refs': SongReferences(),
        })
    return song_blocks
def extract_metadata_and_notes(block_lines: list):
    """Extract Worte, Weise, and commentary notes from the end of block lines.

    The tail of the block is scanned backwards for "Worte und Weise:",
    "Worte:" and "Weise:" lines. Non-blank lines following such a line are
    appended to its value as continuation text. The metadata lines are then
    removed from the content, and a trailing prose paragraph (long lines
    containing a period) is split off as notes.

    Returns (content_lines, lyrics_author, composer, notes).
    """
    lyrics_author = None
    composer = None
    notes_lines = []
    # Find Worte/Weise lines working backwards
    worte_idx = None
    weise_idx = None
    worte_und_weise_idx = None
    # Default: no metadata found, so all lines count as content.
    metadata_start = len(block_lines)
    # Scans at most the last 49 lines of the block, from the end upwards.
    for i in range(len(block_lines) - 1, max(-1, len(block_lines) - 50), -1):
        line = block_lines[i].strip()
        if not line:
            continue
        m = WORTE_UND_WEISE_RE.match(line)
        if m:
            # Combined line: the same value is used for author and composer.
            worte_und_weise_idx = i
            val = m.group(1).strip()
            # Collect continuation lines
            j = i + 1
            while j < len(block_lines):
                nl = block_lines[j].strip()
                if not nl or WORTE_RE.match(nl) or WEISE_RE.match(nl) or is_bulibu_header(nl):
                    break
                val += " " + nl
                j += 1
            lyrics_author = val
            composer = val
            metadata_start = i
            break
        mw = WORTE_RE.match(line)
        if mw and not WORTE_UND_WEISE_RE.match(line):
            # Only the first "Worte:" line encountered (the one closest to
            # the end of the block) is used.
            if worte_idx is None:
                worte_idx = i
                val = mw.group(1).strip()
                j = i + 1
                while j < len(block_lines):
                    nl = block_lines[j].strip()
                    if not nl or WEISE_RE.match(nl) or WORTE_RE.match(nl) or is_bulibu_header(nl):
                        break
                    val += " " + nl
                    j += 1
                lyrics_author = val
            if weise_idx is not None:
                # Both found: stop scanning.
                metadata_start = min(worte_idx, weise_idx)
                break
            else:
                metadata_start = worte_idx
            continue
        mws = WEISE_RE.match(line)
        if mws:
            # Mirror of the "Worte:" handling above.
            if weise_idx is None:
                weise_idx = i
                val = mws.group(1).strip()
                j = i + 1
                while j < len(block_lines):
                    nl = block_lines[j].strip()
                    if not nl or WORTE_RE.match(nl) or WEISE_RE.match(nl) or is_bulibu_header(nl):
                        break
                    val += " " + nl
                    j += 1
                composer = val
            if worte_idx is not None:
                metadata_start = min(worte_idx, weise_idx)
                break
            else:
                metadata_start = weise_idx
            continue
    # Determine final metadata_start and metadata_end
    # (metadata_end is exclusive: the first line after the metadata block).
    metadata_end = metadata_start
    if worte_und_weise_idx is not None:
        metadata_start = worte_und_weise_idx
        # Find end of the Worte und Weise block (including continuation lines)
        j = worte_und_weise_idx + 1
        while j < len(block_lines):
            nl = block_lines[j].strip()
            if not nl:
                break
            if WORTE_RE.match(nl) or WEISE_RE.match(nl) or is_bulibu_header(nl):
                break
            j += 1
        metadata_end = j
    elif worte_idx is not None and weise_idx is not None:
        metadata_start = min(worte_idx, weise_idx)
        metadata_end = max(worte_idx, weise_idx) + 1
        # Include continuation lines after the later one
        later_idx = max(worte_idx, weise_idx)
        j = later_idx + 1
        while j < len(block_lines):
            nl = block_lines[j].strip()
            if not nl:
                break
            if WORTE_RE.match(nl) or WEISE_RE.match(nl) or is_bulibu_header(nl):
                break
            j += 1
        metadata_end = j
    elif worte_idx is not None:
        # Only "Worte:" present: the metadata runs up to the next blank line.
        metadata_start = worte_idx
        j = worte_idx + 1
        while j < len(block_lines):
            nl = block_lines[j].strip()
            if not nl:
                break
            j += 1
        metadata_end = j
    elif weise_idx is not None:
        # Only "Weise:" present: the metadata runs up to the next blank line.
        metadata_start = weise_idx
        j = weise_idx + 1
        while j < len(block_lines):
            nl = block_lines[j].strip()
            if not nl:
                break
            j += 1
        metadata_end = j
    # Content is everything EXCEPT the metadata lines.
    # If there's content AFTER the metadata, include it too.
    content_before = block_lines[:metadata_start]
    content_after = block_lines[metadata_end:]
    # Check if content_after has actual song content (not just whitespace)
    has_after_content = any(l.strip() for l in content_after)
    if has_after_content:
        content_lines = content_before + content_after
    else:
        content_lines = content_before
    # Look for commentary/notes between song content and metadata
    # Notes are prose paragraphs separated by blank lines from song content
    # Heuristic: look for a blank line gap, then check if text after it is prose
    last_content = len(content_lines) - 1
    while last_content >= 0 and not content_lines[last_content].strip():
        last_content -= 1
    if last_content >= 0:
        # Search backwards for a blank-line boundary
        for j in range(last_content, 0, -1):
            if not content_lines[j].strip():
                # Check if the block after this blank line is commentary
                candidate = [l.strip() for l in content_lines[j+1:last_content+1] if l.strip()]
                if candidate and len(candidate) >= 1:
                    # Prose heuristic: average line length above 60
                    # characters and at least one period.
                    avg_len = sum(len(l) for l in candidate) / len(candidate)
                    has_period = any('.' in l for l in candidate)
                    if avg_len > 60 and has_period:
                        notes_lines = candidate
                        content_lines = content_lines[:j]
                # NOTE(review): with the break here only the last paragraph
                # is examined - confirm this nesting against the original file.
                break
    return content_lines, lyrics_author, composer, notes_lines
def extract_title(content_lines: list) -> tuple:
    """Extract the song title from the beginning of content lines.

    The title appears on line(s) before the first chord line.
    Often the title is the same text as (or a prefix of) the first lyric line.
    Multiple title lines are joined with single spaces.

    Returns (title, remaining_lines). The title is "" when the content is
    empty or starts with a chord line.
    """
    # Skip empty lines and form feeds at the start
    start = 0
    while start < len(content_lines):
        line = content_lines[start].replace('\f', '').strip()
        if line:
            break
        start += 1
    if start >= len(content_lines):
        # Nothing but blank lines: no title, input returned unchanged.
        return "", content_lines
    # Verse label patterns that should NOT be part of the title
    VERSE_LABEL_RE = re.compile(r'^(Ref\.?:|Refrain:?|Refr\.?:)', re.IGNORECASE)
    # Strategy: collect title lines until we hit a chord line or verse label.
    # The line BEFORE the first chord line is the last title line.
    title_lines = []
    for i in range(start, len(content_lines)):
        line = content_lines[i].replace('\f', '').strip()
        if not line:
            # A blank line ends a title that has already started.
            if title_lines:
                remaining = content_lines[i:]
                return " ".join(title_lines), remaining
            continue
        if is_chord_line(line):
            if title_lines:
                remaining = content_lines[i:]
                return " ".join(title_lines), remaining
            else:
                # Content starts with chords: no title line available.
                return "", content_lines[start:]
        # If this line is a verse label (Ref.:, etc.), stop title collection
        if VERSE_LABEL_RE.match(line):
            if title_lines:
                remaining = content_lines[i:]
                return " ".join(title_lines), remaining
            # Verse label as first line - unusual; skip it
            continue
        # Check if the next non-empty, non-label line is a chord line
        # (looks at most three lines ahead).
        next_chord = False
        for j in range(i + 1, min(i + 4, len(content_lines))):
            nxt = content_lines[j].replace('\f', '').strip()
            if nxt:
                if is_chord_line(nxt):
                    next_chord = True
                elif VERSE_LABEL_RE.match(nxt):
                    continue  # Skip verse labels in lookahead
                break
        # Both branches below do the same thing: this line is the last title
        # line and the content resumes on the following line.
        if next_chord and not title_lines:
            title_lines.append(line)
            remaining = content_lines[i + 1:]
            return " ".join(title_lines), remaining
        elif next_chord and title_lines:
            title_lines.append(line)
            remaining = content_lines[i + 1:]
            return " ".join(title_lines), remaining
        else:
            if not title_lines:
                # First text line: provisionally taken as the title.
                title_lines.append(line)
            else:
                # A second text line not followed by chords is content; it
                # stays in the remaining lines.
                remaining = content_lines[i:]
                return " ".join(title_lines), remaining
    # Fell through - use whatever we collected
    if title_lines:
        return " ".join(title_lines), []
    return "", content_lines[start:]
def parse_song_block(block: dict) -> Optional[Song]:
    """Turn a raw song block into a Song, or return None if it is not a song.

    *block* is a dict with the keys 'lines' (raw text lines) and 'refs'
    (SongReferences). None is returned when the block is empty, has no
    content besides metadata, yields no title, or has a title that is only
    a number or mentions "Illustration".
    """

    def trim_blank_edges(rows: list) -> None:
        # Drop blank lines from both ends, in place.
        while rows and not rows[-1].strip():
            rows.pop()
        while rows and not rows[0].strip():
            rows.pop(0)

    cleaned = [raw.rstrip('\n').replace('\f', '') for raw in block['lines']]
    trim_blank_edges(cleaned)
    if not cleaned:
        return None

    content, lyrics_author, composer, notes = extract_metadata_and_notes(cleaned)
    if not content:
        # Song might be metadata-only (e.g., Tischlieder, Trinksprüche)
        return None
    trim_blank_edges(content)

    title, song_lines = extract_title(content)
    if not title:
        # Some songs start directly with chords: fall back to the first
        # lyric line, shortened at a word boundary when longer than 60 chars.
        stripped_lines = (raw.replace('\f', '').strip() for raw in song_lines)
        title = next(
            (text for text in stripped_lines if text and not is_chord_line(text)),
            "",
        )
        if len(title) > 60:
            title = title[:60].rsplit(' ', 1)[0]
    if not title:
        return None

    title = re.sub(r'\s+', ' ', title).strip()
    # Filter out non-song blocks: page-number artifacts and illustration pages.
    if re.match(r'^\d+$', title) or 'Illustration' in title:
        return None

    return Song(
        title=title,
        lyrics_author=lyrics_author,
        composer=composer,
        notes=notes,
        raw_lines=song_lines,
        refs=block['refs'],
    )
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    """Command-line entry point: parse the text extract and write the .tex files.

    Reads the input file, splits it into songs, then either lists the parsed
    songs (--dry-run) or writes one .tex file per song plus the all-songs
    include file, and prints summary statistics. Files named in
    EXISTING_SONGS are not overwritten but are still listed in the include file.
    """
    parser = argparse.ArgumentParser(description="Import songs from Carmina Leonis text extract")
    parser.add_argument("--input", default=INPUT_FILE, help="Path to extracted text file")
    parser.add_argument("--output-dir", default=OUTPUT_DIR, help="Output directory for .tex files")
    parser.add_argument("--all-songs", default=ALL_SONGS_FILE, help="Path for all-songs.tex")
    parser.add_argument("--dry-run", action="store_true", help="Parse but don't write files")
    args = parser.parse_args()

    print(f"Reading input from: {args.input}")
    lines = read_input(args.input)
    print(f"Read {len(lines)} lines")
    song_start = find_song_start(lines)
    print(f"Songs start at line {song_start + 1}")
    raw_blocks = split_into_raw_songs(lines, song_start)
    print(f"Found {len(raw_blocks)} raw song blocks")

    parsed = [parse_song_block(block) for block in raw_blocks]
    songs = [song for song in parsed if song]
    skipped = len(parsed) - len(songs)
    print(f"Parsed {len(songs)} songs ({skipped} blocks skipped)")

    if args.dry_run:
        for song in songs:
            cl = song.refs.cl or "?"
            print(f" CL {cl:>4s}: {song.title}")
        return

    os.makedirs(args.output_dir, exist_ok=True)
    written = 0
    skipped_existing = 0
    entries = []  # (filename stem, title) of every song listed in all-songs
    taken = set()
    for song in songs:
        base = sanitize_filename(song.title)
        if not base:
            print(f" WARNING: Could not generate filename for '{song.title}', skipping")
            continue
        # Disambiguate duplicate titles with -2, -3, ...
        stem = base
        counter = 2
        while stem + ".tex" in taken:
            stem = f"{base}-{counter}"
            counter += 1
        tex_filename = stem + ".tex"
        taken.add(tex_filename)

        if tex_filename in EXISTING_SONGS:
            print(f" SKIP (existing): {tex_filename}")
            skipped_existing += 1
            entries.append((stem, song.title))
            continue

        # Format before opening so a formatting error leaves no empty file.
        tex_content = format_song_tex(song)
        with open(os.path.join(args.output_dir, tex_filename), 'w', encoding='utf-8') as out:
            out.write(tex_content)
        entries.append((stem, song.title))
        written += 1
    print(f"\nWrote {written} song files ({skipped_existing} existing songs preserved)")

    entries.sort(key=lambda entry: entry[1].lower())
    with open(args.all_songs, 'w', encoding='utf-8') as out:
        out.write("% Auto-generated list of all songs (alphabetical order)\n")
        out.write("% Generated by import-songs.py\n\n")
        for stem, _title in entries:
            out.write(f"\\input{{songs/{stem}}}\n")
    print(f"Generated {args.all_songs} with {len(entries)} entries")

    def has_chords(song) -> bool:
        # True if any non-blank raw line is a chord line.
        return any(is_chord_line(raw.strip()) for raw in song.raw_lines if raw.strip())

    with_chords = sum(1 for song in songs if has_chords(song))
    with_worte = sum(1 for song in songs if song.lyrics_author)
    with_weise = sum(1 for song in songs if song.composer)
    with_cl = sum(1 for song in songs if song.refs.cl)
    print(f"\nStatistics:")
    print(f" Total songs: {len(songs)}")
    print(f" With chords: {with_chords}")
    print(f" With lyrics author (Worte): {with_worte}")
    print(f" With composer (Weise): {with_weise}")
    print(f" With CL number: {with_cl}")
if __name__ == "__main__":
main()