songbook/import-songs.py

#!/usr/bin/env python3
"""
Parse extracted text from Carmina Leonis songbook PDF and generate .tex song files
in leadsheets format.

Usage:
    python3 import-songs.py [--input /tmp/songbook-full.txt] [--output-dir songs/]
"""

import re
import os
import sys
import argparse
from dataclasses import dataclass, field
from typing import Optional


# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Default location of the extracted songbook text (the module usage text shows
# it as the default value of --input).
INPUT_FILE = "/tmp/songbook-full.txt"
# "songs" directory located next to this script.
OUTPUT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "songs")
# "all-songs.tex" file located next to this script.
ALL_SONGS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "all-songs.tex")

# Existing hand-crafted songs that should NOT be overwritten.
# Entries are bare file names including the ".tex" extension.
EXISTING_SONGS = {
    "abend-wird-es-wieder.tex",
    "auf-auf-zum-froehlichen-jagen.tex",
    "die-gedanken-sind-frei.tex",
    "hejo-spann-den-wagen-an.tex",
    "kein-schoener-land.tex",
}

# Single chord pattern: a root letter A-H in either case (German uses H for
# B natural), optionally followed by '#' or 'b', then zero or more quality
# suffixes (m/maj/min, dim, aug, sus, add<n>, 6/7/9/11/13, °, ø, +, or a
# slash bass note).
# This matches ONE chord. The pattern is unanchored, so callers apply it with
# .match() at an explicit position to walk through concatenated chords.
SINGLE_CHORD_RE = re.compile(
    r'[A-Ha-h][#b]?'
    r'(?:m(?:aj|in)?|dim|aug|sus[24]?|add\d{1,2}|7|6|9|11|13|°|ø|\+|/[A-Ha-h][#b]?)*'
)

# BuLiBu footer header line pattern (song pages have BuLiBuII on the same line).
# Matched case-insensitively with .search(), so it may occur anywhere in a line.
BULIBU_HEADER_RE = re.compile(r'BuLiBu\s+BuLiBuII\s+CL\s+SwA', re.IGNORECASE)

# Alternate footer pattern at the very end of the book (no CL column between
# BuLiBuII and SwA).
BULIBU_HEADER_ALT_RE = re.compile(r'BuLiBu\s+BuLiBuII\s+SwA', re.IGNORECASE)

# Metadata patterns. Each captures the text after the colon in group 1 and is
# anchored to the start of the line (leading whitespace allowed).
# "Worte und Weise" = lyrics and melody credited together.
WORTE_UND_WEISE_RE = re.compile(r'^\s*Worte\s+und\s+Weise\s*:\s*(.+)', re.IGNORECASE)
# "Worte" = lyrics credit.
WORTE_RE = re.compile(r'^\s*Worte\s*:\s*(.+)', re.IGNORECASE)
# "Weise" = melody credit.
WEISE_RE = re.compile(r'^\s*Weise\s*:\s*(.+)', re.IGNORECASE)


# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------

@dataclass
class SongReferences:
    """Reference numbers of one song, one field per footer column.

    Each field is None until a number is found for it; parse_reference_numbers
    stores the number as a string of digits.
    """
    # Number found under the "BuLiBu" header column.
    bulibu: Optional[str] = None
    # Number found under the "BuLiBuII" header column (note the attribute is
    # spelled with two lowercase L's).
    bulibull: Optional[str] = None
    # Number found under the "CL" header column.
    cl: Optional[str] = None
    # Number found under the "SwA" header column.
    swa: Optional[str] = None
    # Number found under the "Barde" header column.
    barde: Optional[str] = None
    # Number found under the "LiBock" header column.
    libock: Optional[str] = None


@dataclass
class Song:
    """One parsed song: title, credits, commentary, raw content and references."""
    # Song title as extracted from the lines preceding the first chord line.
    title: str = ""
    lyrics_author: Optional[str] = None    # Text after "Worte:" (lyrics credit)
    composer: Optional[str] = None          # Text after "Weise:" (melody credit)
    notes: list = field(default_factory=list)  # Commentary paragraphs
    raw_lines: list = field(default_factory=list)  # All content lines (chord + lyric)
    # Reference numbers taken from the footer that closes the song.
    refs: SongReferences = field(default_factory=SongReferences)


# ---------------------------------------------------------------------------
# Chord detection helpers
# ---------------------------------------------------------------------------

def can_split_into_chords(token: str) -> bool:
    """Tell whether *token* is one chord or several chords glued together.

    Surrounding parentheses are ignored. The token is consumed left to right,
    one SINGLE_CHORD_RE match at a time; it qualifies only if the matches
    cover it completely.

    Examples: 'G' -> True, 'De' -> True (D + e), 'FCdFCd' -> True,
              'Feld' -> False, 'the' -> False
    """
    body = token.strip("()")
    end = len(body)
    cursor = 0
    while cursor < end:
        hit = SINGLE_CHORD_RE.match(body, cursor)
        # No chord starts here (or no progress was made): not a chord token.
        if hit is None or hit.end() <= cursor:
            return False
        cursor = hit.end()
    # An empty token (e.g. "()") is never a chord.
    return end > 0


def split_chord_token(token: str) -> list:
    """Break a (possibly concatenated) chord token into its individual chords.

    Surrounding parentheses are removed first. Splitting stops at the first
    position where no chord matches, so a trailing non-chord remainder is
    silently left out.

    E.g., 'De' -> ['D', 'e'], 'FCdFCd' -> ['F', 'C', 'd', 'F', 'C', 'd']
    """
    body = token.strip("()")
    pieces = []
    cursor = 0
    while cursor < len(body):
        hit = SINGLE_CHORD_RE.match(body, cursor)
        if hit is None or hit.end() <= cursor:
            break
        pieces.append(hit.group())
        cursor = hit.end()
    return pieces


def is_chord_line(line: str) -> bool:
    """Return True if every whitespace-separated token of *line* is chords.

    Blank lines are not chord lines. Tokens may be several chords run together
    by PDF text extraction (e.g., 'De', 'BF'); each token only has to be
    decomposable into valid chords.
    """
    tokens = line.split()
    return bool(tokens) and all(can_split_into_chords(tok) for tok in tokens)


def is_bulibu_header(line: str) -> bool:
    """Return True if *line* contains a BuLiBu reference-footer header.

    Form feeds are removed before matching; both the regular and the
    alternate header layout are accepted.
    """
    text = line.replace('\f', '')
    if BULIBU_HEADER_RE.search(text):
        return True
    return BULIBU_HEADER_ALT_RE.search(text) is not None


def is_numbers_line(line: str) -> bool:
    """Return True if *line* holds nothing but whitespace-separated numbers.

    A blank line yields False.
    """
    fields = line.split()
    if not fields:
        return False
    for item in fields:
        if not re.match(r'^\d+$', item):
            return False
    return True


def parse_reference_numbers(header_line: str, numbers_line: str) -> SongReferences:
    """Read the reference numbers that sit underneath a BuLiBu footer header.

    The header columns are:  BuLiBu  BuLiBuII  CL  SwA  Barde  LiBock
    The start column of every name found in the header (searched left to
    right, exact case first, then case-insensitively) delimits a slice of the
    numbers line; a slice that is purely digits after stripping becomes the
    value of the matching SongReferences field.

    A header containing "SwA II" (alternate format) yields empty references.
    """
    refs = SongReferences()

    # Form feeds come from PDF page breaks and would shift column positions.
    header = header_line.replace('\f', '')
    numbers = numbers_line.replace('\f', '')

    if "SwA II" in header or "SwA  II" in header:
        return refs

    columns = (
        ("BuLiBu", "bulibu"),
        ("BuLiBuII", "bulibull"),
        ("CL", "cl"),
        ("SwA", "swa"),
        ("Barde", "barde"),
        ("LiBock", "libock"),
    )

    header_lower = header.lower()
    located = []  # (attribute name, start column) for each column present
    cursor = 0
    for label, attr in columns:
        column = header.find(label, cursor)
        if column == -1:
            column = header_lower.find(label.lower(), cursor)
        if column < 0:
            continue
        located.append((attr, column))
        cursor = column + len(label)

    far_edge = max(len(header), len(numbers))
    for n, (attr, column) in enumerate(located):
        # A column extends up to the start of the next located column.
        stop = located[n + 1][1] if n + 1 < len(located) else far_edge
        cell = numbers[column:stop] if column < len(numbers) else ""
        cell = cell.strip()
        if cell and re.match(r'^\d+$', cell):
            setattr(refs, attr, cell)

    return refs


# ---------------------------------------------------------------------------
# Filename and LaTeX helpers
# ---------------------------------------------------------------------------

def sanitize_filename(title: str) -> str:
    """Convert a song title to a sanitized filename (without .tex extension).

    The title is lower-cased, German umlauts and sharp s are transliterated
    (ae/oe/ue/ss), whitespace runs become single hyphens, every character
    outside ``a-z0-9-`` is dropped, and the result is limited to 50
    characters without leading or trailing hyphens.

    Umlauts are recognised both as precomposed characters and in decomposed
    form (base letter followed by U+0308 COMBINING DIAERESIS), which is what
    PDF text extraction frequently produces. Without this the combining mark
    would simply be stripped and "schöner" would become "schoner".

    Returns an empty string when nothing usable remains.
    """
    name = title.lower()
    transliterations = (
        # Decomposed forms first, so the combining mark is consumed together
        # with its base letter.
        ("a\u0308", "ae"),
        ("o\u0308", "oe"),
        ("u\u0308", "ue"),
        ("ä", "ae"),
        ("ö", "oe"),
        ("ü", "ue"),
        ("ß", "ss"),
    )
    for source, target in transliterations:
        name = name.replace(source, target)
    name = re.sub(r'\s+', '-', name)
    name = re.sub(r"[^a-z0-9\-]", "", name)
    name = re.sub(r'-+', '-', name)
    name = name.strip('-')
    if len(name) > 50:
        # Truncation may end on a hyphen; drop it again.
        name = name[:50].rstrip('-')
    return name


def escape_latex(text: str) -> str:
    """Backslash-escape the LaTeX specials ``& % $ # _`` in *text*.

    Backslashes and braces are passed through untouched, so macros already
    present in the text (such as \\chord{...}) keep their structure; note
    that a '#' or '_' inside such a macro would still be escaped.
    """
    table = str.maketrans({
        "&": r"\&",
        "%": r"\%",
        "$": r"\$",
        "#": r"\#",
        "_": r"\_",
    })
    return text.translate(table)


# ---------------------------------------------------------------------------
# Chord-lyric merging
# ---------------------------------------------------------------------------

def parse_chords_with_positions(chord_line: str) -> list:
    """Locate every chord on a chord line.

    Returns a list of (column_position, chord_name) tuples in left-to-right
    order. Tokens are maximal runs of non-space characters; a token that
    cannot be decomposed into chords is ignored. For a concatenated token the
    first chord gets the token's column and each following chord is placed
    directly after the previous chord's text.
    """
    placed = []
    for run in re.finditer(r'[^ ]+', chord_line):
        token = run.group().strip("()")
        if not can_split_into_chords(token):
            continue
        column = run.start()
        for name in split_chord_token(token):
            placed.append((column, name))
            column += len(name)
    return placed


def merge_chord_lyric(chord_line: str, lyric_line: str) -> str:
    """Weave the chords of *chord_line* into *lyric_line* as \\chord{X} macros.

    Each chord is anchored to a word boundary: a chord above the middle of a
    word moves to the start of that word, a chord above a space moves forward
    to the next word. Chords positioned beyond the end of the lyric are
    dropped, and when several chords land on the same anchor only the
    right-most one is kept. The result is stripped and runs of spaces are
    collapsed to a single space.
    """
    placed = parse_chords_with_positions(chord_line)
    if not placed:
        return lyric_line.strip()

    width = len(lyric_line)

    def anchor_for(column):
        """Return the word-boundary column a chord at *column* snaps to."""
        anchor = column
        inside_word = (
            0 < column < width
            and lyric_line[column] != ' '
            and lyric_line[column - 1] != ' '
        )
        if inside_word:
            # Walk back to the first character of the word.
            while anchor > 0 and lyric_line[anchor - 1] != ' ':
                anchor -= 1
        # On a space: advance to the next word (or the end of the line).
        while anchor < width and lyric_line[anchor] == ' ':
            anchor += 1
        return anchor

    anchored = [
        (anchor_for(column), name)
        for column, name in placed
        if column < width
    ]

    # Insert right-to-left so earlier anchors stay valid while the string grows.
    merged = lyric_line
    leftmost_used = len(merged) + 1
    for anchor, name in reversed(anchored):
        if anchor >= leftmost_used:
            # Anchor already taken by a chord further right: skip this one.
            continue
        merged = merged[:anchor] + f"\\chord{{{name}}}" + merged[anchor:]
        leftmost_used = anchor

    merged = re.sub(r'  +', ' ', merged.strip())

    # Separate directly adjacent macros (\chord{A}\chord{B}) with a space to
    # prevent leadsheets tabular nesting issues.
    return re.sub(r'\}(\\chord\{)', r'} \1', merged)


# ---------------------------------------------------------------------------
# Song content processing
# ---------------------------------------------------------------------------

def process_song_lines(raw_lines: list) -> list:
    """Group raw song lines into verses and merge chord lines into lyrics.

    Blank lines separate verses. A chord line directly followed by a lyric
    line is merged with it; a chord line followed by another chord line, a
    blank line, or nothing is discarded. Any other line is kept stripped.

    Returns a list of verses, each a non-empty list of lyric lines.
    """
    rows = [row.replace('\f', '') for row in raw_lines]
    total = len(rows)
    verses = []
    verse = []
    idx = 0

    while idx < total:
        row = rows[idx]
        text = row.strip()
        consumed = 1

        if not text:
            # Blank line closes the verse in progress.
            if verse:
                verses.append(verse)
                verse = []
        elif not is_chord_line(text):
            verse.append(text)
        else:
            follower = rows[idx + 1] if idx + 1 < total else ""
            follower_text = follower.strip()
            if follower_text and not is_chord_line(follower_text):
                # Pass the unstripped lines: column positions matter.
                verse.append(merge_chord_lyric(row, follower))
                consumed = 2
            # Otherwise the chord line has no lyric underneath and is skipped.

        idx += consumed

    if verse:
        verses.append(verse)

    return verses


# ---------------------------------------------------------------------------
# Song formatting
# ---------------------------------------------------------------------------

def escape_property_value(val: str) -> str:
    """Backslash-escape ``& % $ # _`` in a song property value."""
    replacements = (
        ("&", r"\&"),
        ("%", r"\%"),
        ("$", r"\$"),
        ("#", r"\#"),
        ("_", r"\_"),
    )
    for raw, escaped in replacements:
        val = val.replace(raw, escaped)
    return val


def format_song_tex(song: Song) -> str:
    """Render *song* as the text of a .tex file containing one song environment.

    The output consists of the song header with its properties (title,
    optional lyrics/composer credits, optional reference numbers), the notes
    as LaTeX comment lines, and one verse environment per verse produced by
    process_song_lines.
    """
    props = [f"  title = {{{escape_property_value(song.title)}}},"]

    if song.lyrics_author:
        credit = escape_property_value(song.lyrics_author.strip())
        props.append(f"  lyrics = {{{credit}}},")

    if song.composer:
        credit = escape_property_value(song.composer.strip())
        props.append(f"  composer = {{{credit}}},")

    # Reference numbers are emitted in a fixed order, only when present.
    for key in ("bulibu", "bulibull", "cl", "swa", "barde", "libock"):
        number = getattr(song.refs, key)
        if number:
            props.append(f"  {key} = {number},")

    header = "\\begin{song}{\n" + "\n".join(props) + "\n}\n"

    def escape_lyric(line):
        """Escape LaTeX specials in *line* while leaving chord macros intact."""
        stashed = []

        def stash(match):
            stashed.append(match.group(0))
            return f"CHORDPLACEHOLDER{len(stashed)-1}ENDPLACEHOLDER"

        # Swap the macros for inert placeholders, escape, then swap back.
        text = escape_latex(re.sub(r'\\chord\{[^}]*\}', stash, line))
        for number, macro in enumerate(stashed):
            text = text.replace(f"CHORDPLACEHOLDER{number}ENDPLACEHOLDER", macro)
        return text

    verse_blocks = []
    for verse in process_song_lines(song.raw_lines):
        if not verse:
            continue
        verse_text = " \\\\\n".join(escape_lyric(line) for line in verse)
        verse_blocks.append(f"\\begin{{verse}}\n{verse_text}\n\\end{{verse}}")

    body = "\n\n".join(verse_blocks)

    note_section = ""
    if song.notes:
        # Notes may themselves span several lines; comment out each one.
        note_rows = "\n".join(song.notes).split("\n")
        note_section = "\n" + "\n".join(f"% {row}" for row in note_rows) + "\n"

    return header + note_section + "\n" + body + "\n\n\\end{song}\n"


# ---------------------------------------------------------------------------
# Main parsing logic
# ---------------------------------------------------------------------------

def read_input(path: str) -> list:
    """Return the lines of the UTF-8 text file at *path*, line endings kept."""
    with open(path, encoding='utf-8') as handle:
        return list(handle)


def find_song_start(lines: list) -> int:
    """Return the index of the first BuLiBu footer header line.

    Song pages use 'BuLiBu  BuLiBuII  CL  SwA  Barde  LiBock'.
    TOC pages use a different layout with BuLiBu appearing separately.
    Falls back to 0 when no footer header is present.
    """
    return next(
        (idx for idx, text in enumerate(lines) if is_bulibu_header(text)),
        0,
    )


def has_metadata_before(lines: list, footer_idx: int, search_floor: int) -> bool:
    """Report whether a Worte/Weise credit line precedes the footer.

    Only the lines in front of *footer_idx* are inspected, going back at most
    40 lines and never before *search_floor*.
    """
    credit_patterns = (WORTE_UND_WEISE_RE, WORTE_RE, WEISE_RE)
    window_start = max(search_floor, footer_idx - 40)
    for row in range(window_start, footer_idx):
        text = lines[row].strip()
        if any(pattern.match(text) for pattern in credit_patterns):
            return True
    return False


def is_song_end_footer(lines: list, footer_idx: int, search_floor: int) -> bool:
    """Determine if a BuLiBu footer marks the end of a song.

    A footer ends a song if Worte/Weise metadata appears before it.
    Some songs don't have Worte/Weise at all, which we handle in the
    splitting logic.

    This is a thin wrapper: the result is exactly that of
    has_metadata_before() called with the same arguments.
    """
    return has_metadata_before(lines, footer_idx, search_floor)


def looks_like_new_song_after(lines: list, after_start: int) -> bool:
    """Guess whether a new song begins at line index *after_start*.

    Up to 12 lines are examined and the first (at most five) non-blank lines
    that are not footer headers are sampled. A new song is assumed when the
    first sampled line could be a title (shorter than 80 characters, neither
    a chord line nor a numbers-only line) and one of the next three sampled
    lines is a chord line. This tolerates a "Ref.:" style label between title
    and chords. Anything else is treated as a continuation.
    """
    total = len(lines)
    if after_start >= total:
        return False

    sample = []
    for idx in range(after_start, min(after_start + 12, total)):
        text = lines[idx].replace('\f', '').strip()
        if not text or is_bulibu_header(text):
            continue
        sample.append(text)
        if len(sample) >= 5:
            break

    if len(sample) < 2:
        return False

    candidate_title = sample[0]
    disqualified = (
        is_chord_line(candidate_title)
        or len(candidate_title) >= 80
        or re.match(r'^\d+(\s+\d+)*$', candidate_title.strip())
    )
    if disqualified:
        return False

    # A chord line shortly after the candidate title confirms the new song.
    return any(is_chord_line(text) for text in sample[1:4])


def split_into_raw_songs(lines: list, start_idx: int) -> list:
    """Split the lines from start_idx onward into raw song blocks.

    Strategy:
    1. Find all BuLiBu footer positions at or after start_idx.
    2. Optionally emit one block for a song found in the (up to) 100 lines
       before the first footer; it receives the first footer's references.
    3. The first footer itself only marks where the next block starts.
    4. Every later footer is either a song boundary or a mid-song page break:
       it is a boundary when the content after it looks like a new song
       (title + chords), or when it is the last footer. Otherwise the lines
       are accumulated into the song in progress.
    5. Whatever is left after the last footer (plus any accumulated lines)
       becomes a final block with empty references.

    Args:
        lines: All input lines.
        start_idx: Index from which footers are searched.

    Returns:
        A list of dicts, each with a 'lines' entry (list of raw lines) and a
        'refs' entry (SongReferences). Empty when no footer is found.
    """
    footer_positions = []
    for i in range(start_idx, len(lines)):
        if is_bulibu_header(lines[i]):
            footer_positions.append(i)

    if not footer_positions:
        return []

    song_blocks = []
    current_block_start = None
    # Lines accumulated across mid-song page breaks for the song in progress.
    current_block_lines = []

    # Check for a song BEFORE the first footer (between TOC and first song footer).
    # The first song in the book may appear before any BuLiBu footer.
    first_footer = footer_positions[0]
    # Search forward from ~100 lines before the first footer for a title + chord pattern.
    # Note that this window is not limited by start_idx.
    search_from = max(0, first_footer - 100)
    pre_footer_song_start = None
    for i in range(search_from, first_footer - 1):
        line = lines[i].replace('\f', '').strip()
        next_line = lines[i + 1].replace('\f', '').strip() if i + 1 < first_footer else ''
        # Look for: short text line followed by a chord line
        if (line and not is_chord_line(line) and len(line) < 80 and
                next_line and is_chord_line(next_line)):
            pre_footer_song_start = i
            break

    if pre_footer_song_start is not None:
        # The song runs up to the first footer, whose numbers line (the line
        # right after the header) supplies its references.
        pre_block = lines[pre_footer_song_start:first_footer]
        refs = parse_reference_numbers(
            lines[first_footer],
            lines[first_footer + 1] if first_footer + 1 < len(lines) else ""
        )
        song_blocks.append({
            'lines': pre_block,
            'refs': refs,
        })

    for fi, footer_idx in enumerate(footer_positions):
        # The numbers line is expected directly below the footer header.
        numbers_idx = footer_idx + 1
        if numbers_idx >= len(lines):
            break

        # For the first footer, it's the transition from TOC to songs
        if fi == 0:
            current_block_start = numbers_idx + 1
            continue

        # Determine search floor (don't search before previous footer's numbers line)
        prev_footer = footer_positions[fi - 1]
        search_floor = prev_footer + 2  # Skip header + numbers line

        # Collect lines from current_block_start to this footer
        if current_block_start is not None:
            block_lines = lines[current_block_start:footer_idx]

            has_metadata = has_metadata_before(lines, footer_idx, search_floor)
            after_start = numbers_idx + 1
            new_song_follows = looks_like_new_song_after(lines, after_start)

            # Decision: is this a song boundary?
            # - If a new song follows: song boundary (with or without metadata)
            # - If NO new song follows: mid-song page break, even when metadata
            #   is present (metadata appeared before the last page of the song)
            # NOTE(review): has_metadata only contributes for the last footer,
            # which the special case below forces to True anyway, so it does
            # not influence the outcome.
            is_boundary = new_song_follows or (has_metadata and fi == len(footer_positions) - 1)

            # Special case: if this is the last footer, it always ends the song
            if fi == len(footer_positions) - 1:
                is_boundary = True

            if is_boundary:
                # Close the song: accumulated pages plus this page, tagged with
                # the references of the footer that ends it.
                current_block_lines.extend(block_lines)
                refs = parse_reference_numbers(
                    lines[footer_idx],
                    lines[numbers_idx] if numbers_idx < len(lines) else ""
                )
                song_blocks.append({
                    'lines': current_block_lines,
                    'refs': refs,
                })
                current_block_lines = []
            else:
                # Mid-song page break - accumulate
                current_block_lines.extend(block_lines)

            # The next page starts after this footer's numbers line.
            current_block_start = numbers_idx + 1

    # Handle remaining content (everything after the last processed footer)
    if current_block_start and current_block_start < len(lines):
        remaining = lines[current_block_start:]
        current_block_lines.extend(remaining)

    if current_block_lines:
        # No footer follows this content, so no reference numbers are known.
        song_blocks.append({
            'lines': current_block_lines,
            'refs': SongReferences(),
        })

    return song_blocks


def extract_metadata_and_notes(block_lines: list):
    """Extract Worte, Weise, and commentary notes from the end of block lines.

    The last lines of the block (at most 49) are scanned backwards for a
    "Worte und Weise:", "Worte:" or "Weise:" line. The credit value is the
    text after the colon plus any directly following continuation lines.
    The credit lines are removed from the content, and a trailing prose
    paragraph (long lines containing a period) is split off as notes.

    Args:
        block_lines: Lines of one song block.

    Returns:
        A tuple (content_lines, lyrics_author, composer, notes):
        content_lines is the block without the credit lines and notes;
        lyrics_author and composer are strings or None (both get the same
        value for a "Worte und Weise" line); notes is a list of stripped
        lines, empty when no commentary was detected.
    """
    lyrics_author = None
    composer = None
    notes_lines = []

    # Find Worte/Weise lines working backwards
    worte_idx = None
    weise_idx = None
    worte_und_weise_idx = None
    metadata_start = len(block_lines)

    # Scan from the last line upwards; the stop bound limits the scan to the
    # final 49 lines of the block.
    for i in range(len(block_lines) - 1, max(-1, len(block_lines) - 50), -1):
        line = block_lines[i].strip()
        if not line:
            continue

        m = WORTE_UND_WEISE_RE.match(line)
        if m:
            # Combined credit: one value serves as both author and composer.
            worte_und_weise_idx = i
            val = m.group(1).strip()
            # Collect continuation lines
            j = i + 1
            while j < len(block_lines):
                nl = block_lines[j].strip()
                if not nl or WORTE_RE.match(nl) or WEISE_RE.match(nl) or is_bulibu_header(nl):
                    break
                val += " " + nl
                j += 1
            lyrics_author = val
            composer = val
            metadata_start = i
            break

        mw = WORTE_RE.match(line)
        if mw and not WORTE_UND_WEISE_RE.match(line):
            # Only the first "Worte:" line met (i.e. the last one in the
            # block) is used; earlier ones fall through and are ignored.
            if worte_idx is None:
                worte_idx = i
                val = mw.group(1).strip()
                j = i + 1
                while j < len(block_lines):
                    nl = block_lines[j].strip()
                    if not nl or WEISE_RE.match(nl) or WORTE_RE.match(nl) or is_bulibu_header(nl):
                        break
                    val += " " + nl
                    j += 1
                lyrics_author = val
                if weise_idx is not None:
                    # Both credits found: the scan is complete.
                    metadata_start = min(worte_idx, weise_idx)
                    break
                else:
                    metadata_start = worte_idx
                continue

        mws = WEISE_RE.match(line)
        if mws:
            # Same handling as for "Worte:", mirrored for the melody credit.
            if weise_idx is None:
                weise_idx = i
                val = mws.group(1).strip()
                j = i + 1
                while j < len(block_lines):
                    nl = block_lines[j].strip()
                    if not nl or WORTE_RE.match(nl) or WEISE_RE.match(nl) or is_bulibu_header(nl):
                        break
                    val += " " + nl
                    j += 1
                composer = val
                if worte_idx is not None:
                    metadata_start = min(worte_idx, weise_idx)
                    break
                else:
                    metadata_start = weise_idx
                continue

    # Determine final metadata_start and metadata_end
    # (metadata_end is exclusive; with no credit found both equal
    # len(block_lines), so nothing is cut out).
    metadata_end = metadata_start
    if worte_und_weise_idx is not None:
        metadata_start = worte_und_weise_idx
        # Find end of the Worte und Weise block (including continuation lines)
        j = worte_und_weise_idx + 1
        while j < len(block_lines):
            nl = block_lines[j].strip()
            if not nl:
                break
            if WORTE_RE.match(nl) or WEISE_RE.match(nl) or is_bulibu_header(nl):
                break
            j += 1
        metadata_end = j
    elif worte_idx is not None and weise_idx is not None:
        # Everything from the earlier credit line through the later one
        # (and its continuation lines) counts as metadata.
        metadata_start = min(worte_idx, weise_idx)
        metadata_end = max(worte_idx, weise_idx) + 1
        # Include continuation lines after the later one
        later_idx = max(worte_idx, weise_idx)
        j = later_idx + 1
        while j < len(block_lines):
            nl = block_lines[j].strip()
            if not nl:
                break
            if WORTE_RE.match(nl) or WEISE_RE.match(nl) or is_bulibu_header(nl):
                break
            j += 1
        metadata_end = j
    elif worte_idx is not None:
        # Single credit: the metadata extends to the next blank line.
        metadata_start = worte_idx
        j = worte_idx + 1
        while j < len(block_lines):
            nl = block_lines[j].strip()
            if not nl:
                break
            j += 1
        metadata_end = j
    elif weise_idx is not None:
        # Single credit: the metadata extends to the next blank line.
        metadata_start = weise_idx
        j = weise_idx + 1
        while j < len(block_lines):
            nl = block_lines[j].strip()
            if not nl:
                break
            j += 1
        metadata_end = j

    # Content is everything EXCEPT the metadata lines.
    # If there's content AFTER the metadata, include it too.
    content_before = block_lines[:metadata_start]
    content_after = block_lines[metadata_end:]

    # Check if content_after has actual song content (not just whitespace)
    has_after_content = any(l.strip() for l in content_after)
    if has_after_content:
        content_lines = content_before + content_after
    else:
        content_lines = content_before

    # Look for commentary/notes at the end of the remaining content
    # Notes are prose paragraphs separated by blank lines from song content
    # Heuristic: look for a blank line gap, then check if text after it is prose
    last_content = len(content_lines) - 1
    while last_content >= 0 and not content_lines[last_content].strip():
        last_content -= 1

    if last_content >= 0:
        # Search backwards for a blank-line boundary
        # (only the last paragraph is examined: the loop stops at the first
        # blank line it meets, whether or not the paragraph qualifies).
        for j in range(last_content, 0, -1):
            if not content_lines[j].strip():
                # Check if the block after this blank line is commentary
                candidate = [l.strip() for l in content_lines[j+1:last_content+1] if l.strip()]
                if candidate and len(candidate) >= 1:
                    avg_len = sum(len(l) for l in candidate) / len(candidate)
                    has_period = any('.' in l for l in candidate)
                    # Prose test: long lines on average and at least one period.
                    if avg_len > 60 and has_period:
                        notes_lines = candidate
                        content_lines = content_lines[:j]
                break

    return content_lines, lyrics_author, composer, notes_lines


def extract_title(content_lines: list) -> tuple:
    """Separate the song title from the lines that follow it.

    The title is taken from the line(s) in front of the first chord line;
    at most two lines are joined with a space. Verse labels such as "Ref.:"
    never become part of the title. When the content starts directly with a
    chord line, the title is empty.

    Returns (title, remaining_lines).
    """
    total = len(content_lines)

    def clean(row):
        return row.replace('\f', '').strip()

    # Skip leading blank lines (form feeds count as blank).
    start = 0
    while start < total and not clean(content_lines[start]):
        start += 1
    if start >= total:
        return "", content_lines

    # Verse labels that must not be mistaken for (part of) the title.
    label_re = re.compile(r'^(Ref\.?:|Refrain:?|Refr\.?:)', re.IGNORECASE)

    def chords_follow(idx):
        """True if the next real line within three lines of *idx* is a chord line.

        Blank lines and verse labels are looked past.
        """
        for ahead in range(idx + 1, min(idx + 4, total)):
            upcoming = clean(content_lines[ahead])
            if not upcoming:
                continue
            if is_chord_line(upcoming):
                return True
            if label_re.match(upcoming):
                continue
            return False
        return False

    collected = []
    for idx in range(start, total):
        text = clean(content_lines[idx])
        is_chords = bool(text) and is_chord_line(text)

        if not text or is_chords or label_re.match(text):
            # Blank line, chord line or verse label: the title (if any) ends
            # here and this line stays with the song body.
            if collected:
                return " ".join(collected), content_lines[idx:]
            if is_chords:
                return "", content_lines[start:]
            continue

        if chords_follow(idx):
            # Last title line: the chords start right after it.
            collected.append(text)
            return " ".join(collected), content_lines[idx + 1:]

        if collected:
            # A second plain line without chords ahead belongs to the body.
            return " ".join(collected), content_lines[idx:]
        collected.append(text)

    # Ran out of lines: use whatever was collected.
    if collected:
        return " ".join(collected), []
    return "", content_lines[start:]


def parse_song_block(block: dict) -> Optional[Song]:
    """Build a ``Song`` from one raw block, or return ``None`` for non-songs.

    ``block`` must provide ``'lines'`` (the raw text lines of the block) and
    ``'refs'`` (passed through unchanged to the resulting ``Song``).

    ``None`` is returned when:
      * the block is empty once form feeds and blank edge lines are removed,
      * metadata extraction leaves no content lines,
      * no title can be determined, or
      * the title is only digits or contains the word "Illustration".
    """
    def trim_blank_edges(rows):
        # Remove whitespace-only entries from both ends, mutating ``rows``.
        leading = 0
        while leading < len(rows) and not rows[leading].strip():
            leading += 1
        del rows[:leading]
        while rows and not rows[-1].strip():
            del rows[-1]

    raw_lines = block['lines']
    refs = block['refs']

    # Drop line terminators and form-feed page breaks before any parsing.
    stripped = [raw.rstrip('\n').replace('\f', '') for raw in raw_lines]
    trim_blank_edges(stripped)
    if not stripped:
        return None

    body, lyrics_author, composer, notes = extract_metadata_and_notes(stripped)
    if not body:
        # Nothing but metadata in this block (e.g., Tischlieder, Trinksprüche).
        return None
    trim_blank_edges(body)

    title, song_lines = extract_title(body)

    if not title:
        # No explicit title (some songs open directly with chords): fall back
        # to the first non-empty line that is not a chord line.
        texts = (entry.replace('\f', '').strip() for entry in song_lines)
        fallback = next(
            (text for text in texts if text and not is_chord_line(text)),
            None,
        )
        if fallback is not None:
            # Cap long lines at 60 characters, cutting back to a word boundary.
            title = fallback if len(fallback) <= 60 else fallback[:60].rsplit(' ', 1)[0]

    if not title:
        return None

    title = re.sub(r'\s+', ' ', title).strip()

    # Reject blocks that are not songs: a bare number is a page-number
    # artifact, and "Illustration" marks an illustration page.
    if re.match(r'^\d+$', title) or 'Illustration' in title:
        return None

    return Song(
        title=title,
        lyrics_author=lyrics_author,
        composer=composer,
        notes=notes,
        raw_lines=song_lines,
        refs=refs,
    )


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main():
    """Command-line entry point: parse the text extract and emit .tex files.

    Reads the input file, locates where the songs begin, splits the text into
    raw blocks and parses each block into a song. With ``--dry-run`` only the
    parsed titles are listed. Otherwise one ``.tex`` file per song is written
    to the output directory (files named in ``EXISTING_SONGS`` are left
    untouched), an alphabetical ``\\input`` index is written to the
    ``--all-songs`` path, and summary statistics are printed.
    """
    cli = argparse.ArgumentParser(description="Import songs from Carmina Leonis text extract")
    cli.add_argument("--input", default=INPUT_FILE, help="Path to extracted text file")
    cli.add_argument("--output-dir", default=OUTPUT_DIR, help="Output directory for .tex files")
    cli.add_argument("--all-songs", default=ALL_SONGS_FILE, help="Path for all-songs.tex")
    cli.add_argument("--dry-run", action="store_true", help="Parse but don't write files")
    args = cli.parse_args()

    print(f"Reading input from: {args.input}")
    lines = read_input(args.input)
    print(f"Read {len(lines)} lines")

    song_start = find_song_start(lines)
    print(f"Songs start at line {song_start + 1}")

    raw_blocks = split_into_raw_songs(lines, song_start)
    print(f"Found {len(raw_blocks)} raw song blocks")

    # Blocks that do not yield a song come back falsy and count as skipped.
    parse_results = [parse_song_block(raw) for raw in raw_blocks]
    songs = [result for result in parse_results if result]
    skipped = len(parse_results) - len(songs)

    print(f"Parsed {len(songs)} songs ({skipped} blocks skipped)")

    if args.dry_run:
        for s in songs:
            cl = s.refs.cl or "?"
            print(f"  CL {cl:>4s}: {s.title}")
        return

    os.makedirs(args.output_dir, exist_ok=True)

    written = 0
    skipped_existing = 0
    index_entries = []   # (file stem, song title) for every song kept in the index
    taken = set()        # .tex names already claimed during this run

    for song in songs:
        stem_base = sanitize_filename(song.title)
        if not stem_base:
            print(f"  WARNING: Could not generate filename for '{song.title}', skipping")
            continue

        # Resolve clashes between songs by appending -2, -3, ...
        stem = stem_base
        suffix = 2
        while stem + ".tex" in taken:
            stem = f"{stem_base}-{suffix}"
            suffix += 1
        tex_filename = stem + ".tex"
        taken.add(tex_filename)

        if tex_filename in EXISTING_SONGS:
            # Hand-crafted file: keep it on disk, but still list it in the index.
            print(f"  SKIP (existing): {tex_filename}")
            skipped_existing += 1
        else:
            tex_content = format_song_tex(song)
            target = os.path.join(args.output_dir, tex_filename)
            with open(target, 'w', encoding='utf-8') as f:
                f.write(tex_content)
            written += 1

        index_entries.append((stem, song.title))

    print(f"\nWrote {written} song files ({skipped_existing} existing songs preserved)")

    # Index is ordered by title, ignoring case.
    ordered = sorted(index_entries, key=lambda entry: entry[1].lower())

    with open(args.all_songs, 'w', encoding='utf-8') as f:
        f.write("% Auto-generated list of all songs (alphabetical order)\n")
        f.write("% Generated by import-songs.py\n\n")
        f.writelines(f"\\input{{songs/{stem}}}\n" for stem, _ in ordered)

    print(f"Generated {args.all_songs} with {len(ordered)} entries")

    def has_chords(song):
        # True when any non-blank raw line is recognised as a chord line.
        for raw in song.raw_lines:
            text = raw.strip()
            if text and is_chord_line(text):
                return True
        return False

    with_chords = sum(has_chords(s) for s in songs)
    with_worte = sum(bool(s.lyrics_author) for s in songs)
    with_weise = sum(bool(s.composer) for s in songs)
    with_cl = sum(bool(s.refs.cl) for s in songs)

    print("\nStatistics:")
    print(f"  Total songs: {len(songs)}")
    print(f"  With chords: {with_chords}")
    print(f"  With lyrics author (Worte): {with_worte}")
    print(f"  With composer (Weise): {with_weise}")
    print(f"  With CL number: {with_cl}")


# Run the importer only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()