Improve merge_chord_lyric() to snap chord positions to the start
of the word they fall within, instead of splitting words mid-way.
Fixes artifacts like "Liebespaar \chord{C}e" → "\chord{C}Liebespaare".
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1067 lines
36 KiB
Python
1067 lines
36 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Parse extracted text from Carmina Leonis songbook PDF and generate .tex song files
|
|
in leadsheets format.
|
|
|
|
Usage:
|
|
python3 import-songs.py [--input /tmp/songbook-full.txt] [--output-dir songs/]
|
|
"""
|
|
|
|
import re
|
|
import os
|
|
import sys
|
|
import argparse
|
|
from dataclasses import dataclass, field
|
|
from typing import Optional
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Configuration
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Default location of the text extracted from the songbook PDF.
INPUT_FILE = "/tmp/songbook-full.txt"

# Generated artefacts are placed next to this script.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
OUTPUT_DIR = os.path.join(_SCRIPT_DIR, "songs")
ALL_SONGS_FILE = os.path.join(_SCRIPT_DIR, "all-songs.tex")

# Hand-written song files; the importer lists them but never rewrites them.
EXISTING_SONGS = {
    "abend-wird-es-wieder.tex",
    "auf-auf-zum-froehlichen-jagen.tex",
    "die-gedanken-sind-frei.tex",
    "hejo-spann-den-wagen-an.tex",
    "kein-schoener-land.tex",
}
|
|
|
|
# Matches exactly ONE chord symbol.
# Root: a letter A-H in either case (German notation writes B natural as H),
# optionally followed by a sharp or flat sign.
_CHORD_ROOT = r'[A-Ha-h][#b]?'
# Any number of quality suffixes, including a slash bass note.
_CHORD_SUFFIXES = (
    r'(?:m(?:aj|in)?|dim|aug|sus[24]?|add\d{1,2}|7|6|9|11|13|°|ø|\+|/[A-Ha-h][#b]?)*'
)
SINGLE_CHORD_RE = re.compile(_CHORD_ROOT + _CHORD_SUFFIXES)
|
|
|
|
# Column heading of the reference footer on song pages ("BuLiBuII" sits on
# the same line as "BuLiBu" there).
BULIBU_HEADER_RE = re.compile(r'BuLiBu\s+BuLiBuII\s+CL\s+SwA', flags=re.I)

# Variant heading without the "CL" column, found at the very end of the book.
BULIBU_HEADER_ALT_RE = re.compile(r'BuLiBu\s+BuLiBuII\s+SwA', flags=re.I)

# Credit lines; group 1 captures everything after the colon.
WORTE_UND_WEISE_RE = re.compile(r'^\s*Worte\s+und\s+Weise\s*:\s*(.+)', flags=re.I)
WORTE_RE = re.compile(r'^\s*Worte\s*:\s*(.+)', flags=re.I)
WEISE_RE = re.compile(r'^\s*Weise\s*:\s*(.+)', flags=re.I)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Data classes
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@dataclass
class SongReferences:
    """Reference numbers read from the footer printed below a song.

    Every field holds the digit string found under the footer column of the
    same name, or ``None`` when nothing was found for that column.
    """

    bulibu: Optional[str] = None  # "BuLiBu" column
    bulibull: Optional[str] = None  # "BuLiBuII" column
    cl: Optional[str] = None  # "CL" column
    swa: Optional[str] = None  # "SwA" column
    barde: Optional[str] = None  # "Barde" column
    libock: Optional[str] = None  # "LiBock" column
|
|
|
|
|
|
@dataclass
class Song:
    """One parsed song: title, credits, commentary, body lines and references."""

    title: str = ""
    lyrics_author: Optional[str] = None  # Text credited under "Worte" (words)
    composer: Optional[str] = None  # Text credited under "Weise" (tune)
    notes: list = field(default_factory=list)  # Commentary paragraphs
    raw_lines: list = field(default_factory=list)  # All content lines (chord + lyric), not yet merged
    refs: SongReferences = field(default_factory=SongReferences)  # Footer reference numbers
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Chord detection helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def can_split_into_chords(token: str) -> bool:
    """Tell whether *token* is one chord or several chords written back to back.

    Surrounding parentheses are ignored. The token qualifies only if chord
    matches cover it completely, left to right, with nothing left over.

    Examples: 'G' -> True, 'De' -> True (D + e), 'FCdFCd' -> True,
    'Feld' -> False, 'the' -> False
    """
    bare = token.strip("()")
    if not bare:
        return False

    cursor = 0
    end = len(bare)
    while cursor < end:
        hit = SINGLE_CHORD_RE.match(bare, cursor)
        # Bail out on the first position where no chord starts.
        if hit is None or hit.end() <= cursor:
            return False
        cursor = hit.end()
    return True
|
|
|
|
|
|
def split_chord_token(token: str) -> list:
    """Break a (possibly concatenated) chord token into its individual chords.

    Surrounding parentheses are ignored. Splitting stops at the first
    position where no chord can be matched; chords found up to that point
    are returned.

    E.g., 'De' -> ['D', 'e'], 'FCdFCd' -> ['F', 'C', 'd', 'F', 'C', 'd']
    """
    bare = token.strip("()")
    found = []
    cursor = 0
    while cursor < len(bare):
        hit = SINGLE_CHORD_RE.match(bare, cursor)
        if hit is None or hit.end() <= cursor:
            break
        found.append(hit.group())
        cursor = hit.end()
    return found
|
|
|
|
|
|
def is_chord_line(line: str) -> bool:
    """Tell whether *line* is made up of chord tokens only.

    The line is split on whitespace; every piece must be a chord or a run of
    concatenated chords (PDF extraction produces tokens such as 'De' or
    'BF'). Blank lines are not chord lines.
    """
    pieces = line.split()
    if not pieces:
        return False
    for piece in pieces:
        if not can_split_into_chords(piece):
            return False
    return True
|
|
|
|
|
|
def is_bulibu_header(line: str) -> bool:
    """Tell whether *line* is the column heading of a reference footer.

    Form feeds are removed first; either the regular or the alternate
    heading pattern may match anywhere in the line.
    """
    text = line.replace('\f', '')
    for pattern in (BULIBU_HEADER_RE, BULIBU_HEADER_ALT_RE):
        if pattern.search(text):
            return True
    return False
|
|
|
|
|
|
def is_numbers_line(line: str) -> bool:
    """Tell whether *line* holds nothing but whitespace-separated integers.

    Blank lines yield False.
    """
    fields = line.split()
    if not fields:
        return False
    for field_text in fields:
        if re.match(r'^\d+$', field_text) is None:
            return False
    return True
|
|
|
|
|
|
def parse_reference_numbers(header_line: str, numbers_line: str) -> SongReferences:
    """Read the reference numbers printed below a footer heading.

    The heading columns are: BuLiBu BuLiBuII CL SwA Barde LiBock. The column
    start positions are located in *header_line* (exact case first, then
    case-insensitively), and the text of *numbers_line* between one column
    start and the next is taken as that column's value.

    Args:
        header_line: The footer heading line.
        numbers_line: The line directly below the heading.

    Returns:
        A SongReferences whose fields are set for every column whose slice
        of *numbers_line* consists of digits only. All fields stay ``None``
        for the alternate "SwA II" footer layout, which is skipped.
    """
    refs = SongReferences()

    # Form feeds come from PDF page breaks and would shift column offsets.
    header_line = header_line.replace('\f', '')
    numbers_line = numbers_line.replace('\f', '')

    # Alternate footer layout: recognised by "SwA" followed by "II",
    # whatever amount of whitespace separates the two.
    if re.search(r'SwA\s+II', header_line):
        return refs

    # Heading text -> SongReferences attribute, in left-to-right order.
    columns = (
        ("BuLiBu", "bulibu"),
        ("BuLiBuII", "bulibull"),
        ("CL", "cl"),
        ("SwA", "swa"),
        ("Barde", "barde"),
        ("LiBock", "libock"),
    )

    lowered_header = header_line.lower()
    located = []
    search_start = 0
    for name, attr in columns:
        idx = header_line.find(name, search_start)
        if idx == -1:
            idx = lowered_header.find(name.lower(), search_start)
        if idx >= 0:
            located.append((attr, idx))
            # Continue after this heading so "BuLiBu" is not found again
            # inside "BuLiBuII".
            search_start = idx + len(name)

    line_end = max(len(header_line), len(numbers_line))
    for n, (attr, start) in enumerate(located):
        end = located[n + 1][1] if n + 1 < len(located) else line_end
        cell = numbers_line[start:end].strip()
        if re.match(r'^\d+$', cell):
            setattr(refs, attr, cell)

    return refs
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Filename and LaTeX helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def sanitize_filename(title: str) -> str:
    """Turn a song title into a file name stem (no ``.tex`` extension).

    The title is lower-cased, German umlauts and sharp s are transliterated,
    whitespace runs become single hyphens, everything outside ``a-z``,
    ``0-9`` and ``-`` is dropped, and the result is capped at 50 characters
    without leading or trailing hyphens.
    """
    transliteration = str.maketrans({"ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss"})
    # Lower-casing first means only the lower-case umlauts can occur.
    slug = title.lower().translate(transliteration)
    slug = re.sub(r'\s+', '-', slug)
    slug = re.sub(r"[^a-z0-9\-]", "", slug)
    slug = re.sub(r'-+', '-', slug).strip('-')
    if len(slug) <= 50:
        return slug
    return slug[:50].rstrip('-')
|
|
|
|
|
|
def escape_latex(text: str) -> str:
    """Backslash-escape the characters ``& % $ # _`` in *text*.

    Backslashes and braces are left alone, so the structure of a
    ``\\chord{}`` macro survives; a special character inside the macro's
    argument is still escaped.
    """
    escapes = str.maketrans({char: "\\" + char for char in "&%$#_"})
    return text.translate(escapes)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Chord-lyric merging
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def parse_chords_with_positions(chord_line: str) -> list:
    """Locate every chord on a chord line.

    The line is cut into runs of non-space characters. A run that is a chord
    (or several chords written back to back) contributes one
    ``(column, chord_name)`` tuple per chord: the first chord gets the
    column where the run starts, each following chord the column right
    after the previous chord's text. Runs that are not chords are ignored.
    """
    placed = []
    for run in re.finditer(r'[^ ]+', chord_line):
        bare = run.group().strip("()")
        if not can_split_into_chords(bare):
            continue
        column = run.start()
        for name in split_chord_token(bare):
            placed.append((column, name))
            column += len(name)
    return placed
|
|
|
|
|
|
def _snap_to_word_start(lyric_line: str, pos: int) -> int:
    """Return the index of the word start that column *pos* belongs to.

    A column inside a word moves back to that word's first character; a
    column on a space moves forward to the next word (or to the end of the
    line when only spaces follow).
    """
    snap = pos
    if 0 < pos < len(lyric_line) and lyric_line[pos] != ' ':
        while snap > 0 and lyric_line[snap - 1] != ' ':
            snap -= 1
    while snap < len(lyric_line) and lyric_line[snap] == ' ':
        snap += 1
    return snap


def merge_chord_lyric(chord_line: str, lyric_line: str) -> str:
    """Merge a chord line into the lyric line below it.

    Each chord is inserted as ``\\chord{X}`` at the column it occupies on
    *chord_line*, snapped to a word boundary so that no word is split.
    Chords positioned past the end of *lyric_line* are discarded.

    When several chords snap to the same word start, all of them are kept in
    their original left-to-right order and separated by a single space, so
    no chord is lost and no ``\\chord{A}\\chord{B}`` pair is emitted.

    Args:
        chord_line: Line containing chord symbols at their print columns.
        lyric_line: The lyric line the chords belong to, unstripped so that
            columns line up with *chord_line*.

    Returns:
        The stripped lyric line with chord macros inserted and runs of
        spaces collapsed to one space.
    """
    chords = parse_chords_with_positions(chord_line)
    if not chords:
        return lyric_line.strip()

    line_len = len(lyric_line)
    snapped = [
        (_snap_to_word_start(lyric_line, pos), chord)
        for pos, chord in chords
        if pos < line_len
    ]

    # Insert right to left so earlier insert positions stay valid. Chords
    # sharing a position end up in their original order because each earlier
    # chord is placed in front of the ones already inserted there.
    result = lyric_line
    for pos, chord in reversed(snapped):
        result = result[:pos] + f"\\chord{{{chord}}}" + result[pos:]

    result = result.strip()
    # Collapse runs of spaces into a single space.
    result = re.sub(r' +', ' ', result)

    # Adjacent chord macros with no text between them get a separating
    # space to prevent leadsheets tabular nesting issues.
    result = re.sub(r'\}(\\chord\{)', r'} \1', result)

    return result
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Song content processing
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def process_song_lines(raw_lines: list) -> list:
    """Group raw song lines into verses and merge chords into lyrics.

    Blank lines separate verses. A chord line followed by a lyric line is
    merged into one line; a chord line followed by another chord line, by a
    blank line, or by nothing is dropped. Any other line is kept stripped.

    Returns a list of verses, each a list of lyric lines.
    """
    rows = [raw.replace('\f', '') for raw in raw_lines]
    total = len(rows)
    verses = []
    verse = []
    idx = 0

    while idx < total:
        row = rows[idx]
        text = row.strip()

        if not text:
            # Blank line closes the verse in progress, if any.
            if verse:
                verses.append(verse)
                verse = []
            idx += 1
        elif not is_chord_line(text):
            verse.append(text)
            idx += 1
        else:
            follower = rows[idx + 1] if idx + 1 < total else ""
            follower_text = follower.strip()
            if follower_text and not is_chord_line(follower_text):
                # Pass the unstripped lines so chord columns stay aligned.
                verse.append(merge_chord_lyric(row, follower))
                idx += 2
            else:
                # Orphaned chord line: nothing to attach it to.
                idx += 1

    if verse:
        verses.append(verse)

    return verses
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Song formatting
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def escape_property_value(val: str) -> str:
    """Backslash-escape the characters ``& % $ # _`` in a song property value."""
    escapes = str.maketrans({char: "\\" + char for char in "&%$#_"})
    return val.translate(escapes)
|
|
|
|
|
|
def format_song_tex(song: Song) -> str:
    """Render *song* as the text of a ``.tex`` file.

    The output is a ``song`` environment whose option block lists the title,
    the credits and the reference numbers that are set, followed by the
    notes as ``%`` comment lines and one ``verse`` environment per verse.
    """

    def escape_outside_chords(line):
        # Park every chord macro behind a placeholder, escape the rest of the
        # line, then put the macros back untouched.
        parked = []

        def park(match, _parked=parked):
            _parked.append(match.group(0))
            return f"CHORDPLACEHOLDER{len(_parked)-1}ENDPLACEHOLDER"

        text = escape_latex(re.sub(r'\\chord\{[^}]*\}', park, line))
        for slot, macro in enumerate(parked):
            text = text.replace(f"CHORDPLACEHOLDER{slot}ENDPLACEHOLDER", macro)
        return text

    props = [f" title = {{{escape_property_value(song.title)}}},"]

    for key, credit in (("lyrics", song.lyrics_author), ("composer", song.composer)):
        if credit:
            props.append(f" {key} = {{{escape_property_value(credit.strip())}}},")

    for key in ("bulibu", "bulibull", "cl", "swa", "barde", "libock"):
        number = getattr(song.refs, key)
        if number:
            props.append(f" {key} = {number},")

    header = "\\begin{song}{\n" + "\n".join(props) + "\n}\n"

    environments = []
    for verse in process_song_lines(song.raw_lines):
        if not verse:
            continue
        verse_text = " \\\\\n".join(escape_outside_chords(line) for line in verse)
        environments.append(f"\\begin{{verse}}\n{verse_text}\n\\end{{verse}}")
    body = "\n\n".join(environments)

    note_section = ""
    if song.notes:
        commented = [f"% {text}" for text in "\n".join(song.notes).split("\n")]
        note_section = "\n" + "\n".join(commented) + "\n"

    return header + note_section + "\n" + body + "\n\n\\end{song}\n"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Main parsing logic
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def read_input(path: str) -> list:
    """Return the lines of the UTF-8 text file at *path*, line endings kept."""
    with open(path, 'r', encoding='utf-8') as handle:
        return list(handle)
|
|
|
|
|
|
def find_song_start(lines: list) -> int:
    """Return the index of the first reference-footer heading in *lines*.

    Song pages carry the heading 'BuLiBu BuLiBuII CL SwA Barde LiBock';
    table-of-contents pages are laid out differently and do not match.
    Returns 0 when no heading is found.
    """
    return next(
        (idx for idx, text in enumerate(lines) if is_bulibu_header(text)),
        0,
    )
|
|
|
|
|
|
def has_metadata_before(lines: list, footer_idx: int, search_floor: int) -> bool:
    """Tell whether a Worte/Weise credit line precedes the footer.

    Only the 40 lines before *footer_idx* are examined, and never any line
    before *search_floor*.
    """
    credit_patterns = (WORTE_UND_WEISE_RE, WORTE_RE, WEISE_RE)
    first = max(search_floor, footer_idx - 40)
    for idx in range(first, footer_idx):
        text = lines[idx].strip()
        if any(pattern.match(text) for pattern in credit_patterns):
            return True
    return False
|
|
|
|
|
|
def is_song_end_footer(lines: list, footer_idx: int, search_floor: int) -> bool:
    """Tell whether the footer at *footer_idx* closes a song.

    The footer counts as a song end when a Worte/Weise credit line appears
    before it. Songs without any credit line are not recognised here; the
    splitting logic deals with them.
    """
    credits_seen = has_metadata_before(lines, footer_idx, search_floor)
    return credits_seen
|
|
|
|
|
|
def looks_like_new_song_after(lines: list, after_start: int) -> bool:
    """Guess whether a new song begins at index *after_start*.

    Up to five non-blank, non-footer lines are sampled from the next twelve
    lines. A new song is assumed when the first sampled line could be a
    title (shorter than 80 characters, not a chord line, not just numbers)
    and one of the following three sampled lines is a chord line. That
    leaves room for a "Ref.:" label between title and chords. Anything else
    is treated as the continuation of the current song.
    """
    if after_start >= len(lines):
        return False

    sample = []
    for idx in range(after_start, min(after_start + 12, len(lines))):
        text = lines[idx].replace('\f', '').strip()
        if not text or is_bulibu_header(text):
            continue
        sample.append(text)
        if len(sample) >= 5:
            break

    if len(sample) < 2:
        return False

    headline = sample[0]
    if is_chord_line(headline) or len(headline) >= 80:
        return False
    if re.match(r'^\d+(\s+\d+)*$', headline.strip()):
        return False

    return any(is_chord_line(text) for text in sample[1:4])
|
|
|
|
|
|
def split_into_raw_songs(lines: list, start_idx: int) -> list:
    """Cut the lines from *start_idx* onward into raw song blocks.

    Every reference footer (heading line plus the numbers line below it) is
    a candidate song boundary:

    * A footer ends a song when the text after it looks like a new song
      (title followed by chords), or when it is the last footer.
    * Any other footer is a page break inside a song; the text before and
      after it is joined into the same block.

    A song found in the 100 lines before the first footer is emitted as its
    own block and receives the first footer's numbers. Text after the last
    footer becomes a final block without reference numbers.

    Returns a list of dicts with the keys ``'lines'`` and ``'refs'``.
    """
    footers = [
        idx for idx in range(start_idx, len(lines)) if is_bulibu_header(lines[idx])
    ]
    if not footers:
        return []

    blocks = []
    first = footers[0]
    final_slot = len(footers) - 1

    # A song may precede the first footer: look for a short text line that is
    # directly followed by a chord line.
    lead_in = None
    for idx in range(max(0, first - 100), first - 1):
        here = lines[idx].replace('\f', '').strip()
        below = lines[idx + 1].replace('\f', '').strip() if idx + 1 < first else ''
        if (here and not is_chord_line(here) and len(here) < 80
                and below and is_chord_line(below)):
            lead_in = idx
            break

    if lead_in is not None:
        blocks.append({
            'lines': lines[lead_in:first],
            'refs': parse_reference_numbers(
                lines[first],
                lines[first + 1] if first + 1 < len(lines) else ""
            ),
        })

    segment_start = None
    pending = []

    for slot, footer_idx in enumerate(footers):
        numbers_idx = footer_idx + 1
        if numbers_idx >= len(lines):
            break

        # The first footer only marks where the following text begins.
        if slot > 0 and segment_start is not None:
            pending.extend(lines[segment_start:footer_idx])

            new_song_follows = looks_like_new_song_after(lines, numbers_idx + 1)
            if new_song_follows or slot == final_slot:
                blocks.append({
                    'lines': pending,
                    'refs': parse_reference_numbers(lines[footer_idx], lines[numbers_idx]),
                })
                pending = []
            # Otherwise: page break inside a song, keep accumulating.

        segment_start = numbers_idx + 1

    # Whatever follows the last processed footer.
    if segment_start and segment_start < len(lines):
        pending.extend(lines[segment_start:])

    if pending:
        blocks.append({
            'lines': pending,
            'refs': SongReferences(),
        })

    return blocks
|
|
|
|
|
|
def extract_metadata_and_notes(block_lines: list):
    """Separate credits and commentary from the song text of one block.

    The last 50 lines are scanned from the bottom for "Worte und Weise:",
    "Worte:" and "Weise:" lines. The credit lines (with their continuation
    lines) are removed from the content. A trailing paragraph of the
    remaining content that looks like prose is split off as notes.

    Returns (content_lines, lyrics_author, composer, notes).
    """
    total = len(block_lines)

    def is_label_or_footer(text):
        return bool(WORTE_RE.match(text) or WEISE_RE.match(text) or is_bulibu_header(text))

    def value_with_continuation(idx, head):
        # A credit may wrap: append following lines up to a blank line, the
        # next credit label or a footer heading.
        value = head
        nxt = idx + 1
        while nxt < total:
            text = block_lines[nxt].strip()
            if not text or is_label_or_footer(text):
                break
            value += " " + text
            nxt += 1
        return value

    def paragraph_end(idx, stop_at_labels):
        # Index just past the credit paragraph that starts at idx.
        nxt = idx + 1
        while nxt < total:
            text = block_lines[nxt].strip()
            if not text:
                break
            if stop_at_labels and is_label_or_footer(text):
                break
            nxt += 1
        return nxt

    lyrics_author = None
    composer = None
    notes_lines = []
    worte_idx = None
    weise_idx = None
    worte_und_weise_idx = None

    # Scan upwards from the end of the block.
    for idx in range(total - 1, max(-1, total - 50), -1):
        text = block_lines[idx].strip()
        if not text:
            continue

        combined = WORTE_UND_WEISE_RE.match(text)
        if combined:
            worte_und_weise_idx = idx
            lyrics_author = composer = value_with_continuation(
                idx, combined.group(1).strip()
            )
            break

        worte = WORTE_RE.match(text)
        if worte:
            # Only the lowest "Worte:" line counts.
            if worte_idx is None:
                worte_idx = idx
                lyrics_author = value_with_continuation(idx, worte.group(1).strip())
                if weise_idx is not None:
                    break
            continue

        weise = WEISE_RE.match(text)
        if weise and weise_idx is None:
            weise_idx = idx
            composer = value_with_continuation(idx, weise.group(1).strip())
            if worte_idx is not None:
                break

    # Work out which span of lines the credits occupy.
    if worte_und_weise_idx is not None:
        metadata_start = worte_und_weise_idx
        metadata_end = paragraph_end(worte_und_weise_idx, True)
    elif worte_idx is not None and weise_idx is not None:
        metadata_start = min(worte_idx, weise_idx)
        metadata_end = paragraph_end(max(worte_idx, weise_idx), True)
    elif worte_idx is not None:
        metadata_start = worte_idx
        metadata_end = paragraph_end(worte_idx, False)
    elif weise_idx is not None:
        metadata_start = weise_idx
        metadata_end = paragraph_end(weise_idx, False)
    else:
        metadata_start = metadata_end = total

    # Content is everything except the credit lines; text after the credits
    # is kept only when it is not blank throughout.
    content_lines = block_lines[:metadata_start]
    trailing = block_lines[metadata_end:]
    if any(row.strip() for row in trailing):
        content_lines = content_lines + trailing

    # Commentary heuristic: a final paragraph with long lines and at least
    # one period is treated as notes rather than lyrics.
    last_content = len(content_lines) - 1
    while last_content >= 0 and not content_lines[last_content].strip():
        last_content -= 1

    if last_content >= 0:
        for idx in range(last_content, 0, -1):
            if content_lines[idx].strip():
                continue
            # NOTE(review): only the paragraph after the last blank line is
            # examined - confirm against the intended search behaviour.
            candidate = [
                row.strip()
                for row in content_lines[idx + 1:last_content + 1]
                if row.strip()
            ]
            if candidate:
                avg_len = sum(len(row) for row in candidate) / len(candidate)
                has_period = any('.' in row for row in candidate)
                if avg_len > 60 and has_period:
                    notes_lines = candidate
                    content_lines = content_lines[:idx]
            break

    return content_lines, lyrics_author, composer, notes_lines
|
|
|
|
|
|
def extract_title(content_lines: list) -> tuple:
    """Split the song title off the front of *content_lines*.

    The title is the text before the first chord line. Verse labels such as
    "Ref.:" never become part of it. Often the title repeats (or is a prefix
    of) the first lyric line.

    Returns (title, remaining_lines); the title is "" when the content starts
    with a chord line or contains no text at all.
    """
    total = len(content_lines)

    def cleaned(idx):
        return content_lines[idx].replace('\f', '').strip()

    # Skip blank lines and bare form feeds at the top.
    start = 0
    while start < total and not cleaned(start):
        start += 1
    if start >= total:
        return "", content_lines

    # Verse label patterns that should NOT be part of the title
    VERSE_LABEL_RE = re.compile(r'^(Ref\.?:|Refrain:?|Refr\.?:)', re.IGNORECASE)

    title_parts = []

    for idx in range(start, total):
        text = cleaned(idx)

        if not text:
            if title_parts:
                return " ".join(title_parts), content_lines[idx:]
            continue

        if is_chord_line(text):
            if title_parts:
                return " ".join(title_parts), content_lines[idx:]
            return "", content_lines[start:]

        if VERSE_LABEL_RE.match(text):
            if title_parts:
                return " ".join(title_parts), content_lines[idx:]
            # A verse label before any title text is simply skipped.
            continue

        # Is the next non-blank, non-label line (within three lines) a
        # chord line?
        chord_follows = False
        for ahead in range(idx + 1, min(idx + 4, total)):
            upcoming = cleaned(ahead)
            if not upcoming:
                continue
            if is_chord_line(upcoming):
                chord_follows = True
            elif VERSE_LABEL_RE.match(upcoming):
                continue
            break

        if chord_follows:
            # This is the last title line; the song body starts right after.
            title_parts.append(text)
            return " ".join(title_parts), content_lines[idx + 1:]

        if title_parts:
            # Second plain text line without chords: the title ended above.
            return " ".join(title_parts), content_lines[idx:]

        title_parts.append(text)

    # Ran out of lines: use whatever was collected.
    if title_parts:
        return " ".join(title_parts), []
    return "", content_lines[start:]
|
|
|
|
|
|
def parse_song_block(block: dict) -> Optional[Song]:
    """Build a Song from one raw block, or return None for non-song blocks.

    A block is rejected when it is empty, has no content besides credits,
    yields no title, or has a title that is only a number or mentions
    "Illustration".
    """

    def without_blank_edges(rows):
        lo, hi = 0, len(rows)
        while lo < hi and not rows[lo].strip():
            lo += 1
        while hi > lo and not rows[hi - 1].strip():
            hi -= 1
        return rows[lo:hi]

    raw_rows = block['lines']
    refs = block['refs']

    rows = without_blank_edges(
        [row.rstrip('\n').replace('\f', '') for row in raw_rows]
    )
    if not rows:
        return None

    content, lyrics_author, composer, notes = extract_metadata_and_notes(rows)
    if not content:
        # Block holds credits only (e.g., Tischlieder, Trinksprüche).
        return None

    title, song_lines = extract_title(without_blank_edges(content))

    if not title:
        # Songs that open directly with chords: fall back to the first lyric
        # line, cut at a word boundary when it is longer than 60 characters.
        for row in song_lines:
            text = row.replace('\f', '').strip()
            if text and not is_chord_line(text):
                title = text[:60].rsplit(' ', 1)[0] if len(text) > 60 else text
                break

    if not title:
        return None

    title = re.sub(r'\s+', ' ', title).strip()

    # Page-number artefacts and illustration pages are not songs.
    if re.match(r'^\d+$', title) or 'Illustration' in title:
        return None

    return Song(
        title=title,
        lyrics_author=lyrics_author,
        composer=composer,
        notes=notes,
        raw_lines=song_lines,
        refs=refs,
    )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Main
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def main():
    """Command-line entry point: parse the text extract and write the songs.

    Reads the input file, splits it into songs, writes one ``.tex`` file per
    song (files named in EXISTING_SONGS are left untouched), writes the
    alphabetical ``all-songs.tex`` include list and prints statistics. With
    ``--dry-run`` only the parsed song list is printed.
    """
    cli = argparse.ArgumentParser(description="Import songs from Carmina Leonis text extract")
    cli.add_argument("--input", default=INPUT_FILE, help="Path to extracted text file")
    cli.add_argument("--output-dir", default=OUTPUT_DIR, help="Output directory for .tex files")
    cli.add_argument("--all-songs", default=ALL_SONGS_FILE, help="Path for all-songs.tex")
    cli.add_argument("--dry-run", action="store_true", help="Parse but don't write files")
    args = cli.parse_args()

    print(f"Reading input from: {args.input}")
    lines = read_input(args.input)
    print(f"Read {len(lines)} lines")

    song_start = find_song_start(lines)
    print(f"Songs start at line {song_start + 1}")

    raw_blocks = split_into_raw_songs(lines, song_start)
    print(f"Found {len(raw_blocks)} raw song blocks")

    parsed = [parse_song_block(block) for block in raw_blocks]
    songs = [song for song in parsed if song]
    skipped = len(parsed) - len(songs)
    print(f"Parsed {len(songs)} songs ({skipped} blocks skipped)")

    if args.dry_run:
        for song in songs:
            cl = song.refs.cl or "?"
            print(f" CL {cl:>4s}: {song.title}")
        return

    os.makedirs(args.output_dir, exist_ok=True)

    written = 0
    skipped_existing = 0
    index_entries = []
    taken = set()

    for song in songs:
        stem = sanitize_filename(song.title)
        if not stem:
            print(f" WARNING: Could not generate filename for '{song.title}', skipping")
            continue

        # Disambiguate songs whose titles sanitize to the same name.
        base_stem = stem
        counter = 2
        while stem + ".tex" in taken:
            stem = f"{base_stem}-{counter}"
            counter += 1
        tex_filename = stem + ".tex"
        taken.add(tex_filename)

        if tex_filename in EXISTING_SONGS:
            print(f" SKIP (existing): {tex_filename}")
            skipped_existing += 1
            index_entries.append((stem, song.title))
            continue

        tex_content = format_song_tex(song)
        target = os.path.join(args.output_dir, tex_filename)
        with open(target, 'w', encoding='utf-8') as out:
            out.write(tex_content)

        index_entries.append((stem, song.title))
        written += 1

    print(f"\nWrote {written} song files ({skipped_existing} existing songs preserved)")

    index_entries.sort(key=lambda entry: entry[1].lower())

    with open(args.all_songs, 'w', encoding='utf-8') as out:
        out.write("% Auto-generated list of all songs (alphabetical order)\n")
        out.write("% Generated by import-songs.py\n\n")
        for stem, _title in index_entries:
            out.write(f"\\input{{songs/{stem}}}\n")

    print(f"Generated {args.all_songs} with {len(index_entries)} entries")

    with_chords = sum(
        1 for song in songs
        if any(is_chord_line(row.strip()) for row in song.raw_lines if row.strip())
    )
    with_worte = sum(1 for song in songs if song.lyrics_author)
    with_weise = sum(1 for song in songs if song.composer)
    with_cl = sum(1 for song in songs if song.refs.cl)

    print("\nStatistics:")
    print(f" Total songs: {len(songs)}")
    print(f" With chords: {with_chords}")
    print(f" With lyrics author (Worte): {with_worte}")
    print(f" With composer (Weise): {with_weise}")
    print(f" With CL number: {with_cl}")
|
|
|
|
|
|
# Run the importer only when this file is executed as a script.
if __name__ == "__main__":
    main()
|