#!/usr/bin/env python3
"""
Parse extracted text from Carmina Leonis songbook PDF and generate .tex song files
in leadsheets format.

Usage:
    python3 import-songs.py [--input /tmp/songbook-full.txt] [--output-dir songs/]
"""

import re
import os
import sys
import argparse
from dataclasses import dataclass, field
from typing import Optional

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

INPUT_FILE = "/tmp/songbook-full.txt"
OUTPUT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "songs")
ALL_SONGS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "all-songs.tex")

# Existing hand-crafted songs that should NOT be overwritten
EXISTING_SONGS = {
    "abend-wird-es-wieder.tex",
    "auf-auf-zum-froehlichen-jagen.tex",
    "die-gedanken-sind-frei.tex",
    "hejo-spann-den-wagen-an.tex",
    "kein-schoener-land.tex",
}

# Single chord pattern: a letter A-H (German uses H for B natural), optionally with
# sharp/flat, then optional quality suffix.
# This matches ONE chord.
SINGLE_CHORD_RE = re.compile(
    r'[A-Ha-h][#b]?'
    r'(?:m(?:aj|in)?|dim|aug|sus[24]?|add\d{1,2}|7|6|9|11|13|°|ø|\+|/[A-Ha-h][#b]?)*'
)

# BuLiBu footer header line pattern (song pages have BuLiBuII on the same line)
BULIBU_HEADER_RE = re.compile(r'BuLiBu\s+BuLiBuII\s+CL\s+SwA', re.IGNORECASE)
# Alternate footer pattern at the very end of the book
BULIBU_HEADER_ALT_RE = re.compile(r'BuLiBu\s+BuLiBuII\s+SwA', re.IGNORECASE)

# Metadata patterns
WORTE_UND_WEISE_RE = re.compile(r'^\s*Worte\s+und\s+Weise\s*:\s*(.+)', re.IGNORECASE)
WORTE_RE = re.compile(r'^\s*Worte\s*:\s*(.+)', re.IGNORECASE)
WEISE_RE = re.compile(r'^\s*Weise\s*:\s*(.+)', re.IGNORECASE)

# Verse label patterns that should NOT be part of the title
VERSE_LABEL_RE = re.compile(r'^(Ref\.?:|Refrain:?|Refr\.?:)', re.IGNORECASE)

# Marker of the alternate footer layout ("SwA II" column), with any spacing
_SWA_II_RE = re.compile(r'SwA\s+II')

# A token made only of decimal digits
_DIGITS_RE = re.compile(r'^\d+$')

# A \chord{...} macro; the capturing group makes re.split keep the macros
_CHORD_MACRO_SPLIT_RE = re.compile(r'(\\chord\{[^}]*\})')

# Characters escaped in lyric text and song property values
_LATEX_ESCAPES = str.maketrans({
    "&": r"\&",
    "%": r"\%",
    "$": r"\$",
    "#": r"\#",
    "_": r"\_",
})


# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------

@dataclass
class SongReferences:
    bulibu: Optional[str] = None
    bulibull: Optional[str] = None
    cl: Optional[str] = None
    swa: Optional[str] = None
    barde: Optional[str] = None
    libock: Optional[str] = None


@dataclass
class Song:
    title: str = ""
    lyrics_author: Optional[str] = None  # Worte
    composer: Optional[str] = None  # Weise
    notes: list = field(default_factory=list)  # Commentary paragraphs
    raw_lines: list = field(default_factory=list)  # All content lines (chord + lyric)
    refs: SongReferences = field(default_factory=SongReferences)


# ---------------------------------------------------------------------------
# Chord detection helpers
# ---------------------------------------------------------------------------

def can_split_into_chords(token: str) -> bool:
    """Check if a token is either a single chord or a concatenation of chords.

    Surrounding parentheses are ignored.

    Examples: 'G' -> True, 'De' -> True (D + e), 'FCdFCd' -> True,
    'Feld' -> False, 'the' -> False
    """
    token = token.strip("()")
    if not token:
        return False
    # The whole token must be consumed by back-to-back chord matches
    pos = 0
    while pos < len(token):
        m = SINGLE_CHORD_RE.match(token, pos)
        if not m or m.end() <= pos:
            return False
        pos = m.end()
    return True


def split_chord_token(token: str) -> list:
    """Split a potentially concatenated chord token into individual chords.

    Splitting stops at the first character that does not start a chord, so a
    token that is not fully decomposable yields only its leading chords.

    E.g., 'De' -> ['D', 'e'], 'FCdFCd' -> ['F', 'C', 'd', 'F', 'C', 'd']
    """
    clean = token.strip("()")
    chords = []
    pos = 0
    while pos < len(clean):
        m = SINGLE_CHORD_RE.match(clean, pos)
        if not m or m.end() <= pos:
            break
        chords.append(m.group())
        pos = m.end()
    return chords


def is_chord_line(line: str) -> bool:
    """Return True if *line* consists entirely of chord tokens (with spacing).

    Handles concatenated chord tokens from PDF text extraction (e.g., 'De', 'BF').
    Blank lines are not chord lines.
    """
    tokens = line.split()
    if not tokens:
        return False
    # Every token must be decomposable into valid chords
    return all(can_split_into_chords(t) for t in tokens)


def is_bulibu_header(line: str) -> bool:
    """Return True if this line is a BuLiBu reference-footer header."""
    clean = line.replace('\f', '')
    return bool(BULIBU_HEADER_RE.search(clean) or BULIBU_HEADER_ALT_RE.search(clean))


def is_numbers_line(line: str) -> bool:
    """Return True if this line consists only of numbers separated by whitespace."""
    tokens = line.split()
    if not tokens:
        return False
    return all(_DIGITS_RE.match(t) for t in tokens)


def parse_reference_numbers(header_line: str, numbers_line: str) -> SongReferences:
    """Parse the BuLiBu header + numbers line into SongReferences.

    The header columns are: BuLiBu BuLiBuII CL SwA Barde LiBock

    Column positions are taken from the header; the text of the numbers line
    between one column start and the next is assigned to that column when it
    is purely numeric. Headers of the alternate layout (containing a
    "SwA II" column, with any amount of whitespace) yield empty references.
    """
    refs = SongReferences()

    # Strip form feed characters that come from PDF page breaks
    header_line = header_line.replace('\f', '')
    numbers_line = numbers_line.replace('\f', '')

    # Skip alternate format
    if _SWA_II_RE.search(header_line):
        return refs

    attr_map = {
        "BuLiBu": "bulibu",
        "BuLiBuII": "bulibull",
        "CL": "cl",
        "SwA": "swa",
        "Barde": "barde",
        "LiBock": "libock",
    }

    # Locate each column left to right; fall back to a case-insensitive search
    lower_header = header_line.lower()
    valid_positions = []
    search_start = 0
    for name in attr_map:
        idx = header_line.find(name, search_start)
        if idx == -1:
            idx = lower_header.find(name.lower(), search_start)
        if idx >= 0:
            valid_positions.append((name, idx))
            search_start = idx + len(name)

    for i, (name, pos) in enumerate(valid_positions):
        if i + 1 < len(valid_positions):
            end = valid_positions[i + 1][1]
        else:
            end = max(len(header_line), len(numbers_line))
        segment = numbers_line[pos:end].strip() if pos < len(numbers_line) else ""
        if segment and _DIGITS_RE.match(segment):
            setattr(refs, attr_map[name], segment)

    return refs


# ---------------------------------------------------------------------------
# Filename and LaTeX helpers
# ---------------------------------------------------------------------------

def sanitize_filename(title: str) -> str:
    """Convert a song title to a sanitized filename (without .tex extension).

    Lower-cases the title, transliterates German umlauts and sharp s, turns
    whitespace into hyphens, drops every other character outside [a-z0-9-],
    and limits the result to 50 characters.
    """
    name = title.lower()
    # The title is already lower-cased, so only lower-case umlauts can occur
    name = name.replace("ä", "ae").replace("ö", "oe").replace("ü", "ue").replace("ß", "ss")
    name = re.sub(r'\s+', '-', name)
    name = re.sub(r"[^a-z0-9\-]", "", name)
    name = re.sub(r'-+', '-', name)
    name = name.strip('-')
    if len(name) > 50:
        name = name[:50].rstrip('-')
    return name


def escape_latex(text: str) -> str:
    """Escape the LaTeX special characters & % $ # _ in *text*.

    This escapes every occurrence; callers that need to keep \\chord{} macros
    intact must escape only the text around them (see format_song_tex).
    """
    return text.translate(_LATEX_ESCAPES)


# ---------------------------------------------------------------------------
# Chord-lyric merging
# ---------------------------------------------------------------------------

def parse_chords_with_positions(chord_line: str) -> list:
    """Parse a chord line and return list of (column_position, chord_name) tuples.

    Tokens are runs of non-space characters. Handles concatenated chord tokens
    by splitting them and placing each chord directly after the previous one.
    Tokens that are not decomposable into chords are ignored.
    """
    chords = []
    for token_match in re.finditer(r'[^ ]+', chord_line):
        clean_token = token_match.group().strip("()")
        if not can_split_into_chords(clean_token):
            continue
        # First chord sits at the token start, the rest follow back to back
        pos = token_match.start()
        for chord in split_chord_token(clean_token):
            chords.append((pos, chord))
            pos += len(chord)
    return chords


def merge_chord_lyric(chord_line: str, lyric_line: str) -> str:
    """Merge a chord line and lyric line by inserting \\chord{X} at positions.

    Snaps chord positions to word boundaries: if a chord falls inside a word,
    it is moved to the start of that word; if it falls on a space, it is moved
    forward to the next word. Chords positioned beyond the end of the lyric
    line are dropped, and when several chords snap to the same position only
    the rightmost one is kept.
    """
    chords = parse_chords_with_positions(chord_line)
    if not chords:
        return lyric_line.strip()

    lyric_len = len(lyric_line)

    snapped = []
    for pos, chord in chords:
        # Only keep chords within lyric range
        if pos >= lyric_len:
            continue
        snap_pos = pos
        # Mid-word (this char and the previous one are both non-space):
        # scan backwards to the word start
        if pos > 0 and lyric_line[pos] != ' ' and lyric_line[pos - 1] != ' ':
            while snap_pos > 0 and lyric_line[snap_pos - 1] != ' ':
                snap_pos -= 1
        # On a space: move forward to the next non-space
        while snap_pos < lyric_len and lyric_line[snap_pos] == ' ':
            snap_pos += 1
        snapped.append((snap_pos, chord))

    # Insert from right to left so earlier positions stay valid. A chord that
    # would land on the position just used is skipped.
    result = lyric_line
    prev_insert_pos = len(result) + 1
    for pos, chord in reversed(snapped):
        if pos < prev_insert_pos:
            result = result[:pos] + f"\\chord{{{chord}}}" + result[pos:]
            prev_insert_pos = pos

    result = result.strip()
    # Collapse multiple spaces but preserve single spaces
    result = re.sub(r' +', ' ', result)
    # For adjacent chords without text between them (e.g. \chord{A}\chord{B}),
    # add a space to prevent leadsheets tabular nesting issues
    result = re.sub(r'\}(\\chord\{)', r'} \1', result)
    return result


# ---------------------------------------------------------------------------
# Song content processing
# ---------------------------------------------------------------------------

def process_song_lines(raw_lines: list) -> list:
    """Process raw song content lines into verses with chord merging.

    Blank lines separate verses. A chord line is merged into the lyric line
    that follows it; a chord line followed by another chord line, by a blank
    line, or by nothing is discarded.

    Returns a list of verses, where each verse is a list of merged lyric lines.
    """
    lines = [l.replace('\f', '') for l in raw_lines]
    verses = []
    current_verse = []
    i = 0
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()

        if not stripped:
            if current_verse:
                verses.append(current_verse)
                current_verse = []
            i += 1
            continue

        if is_chord_line(stripped):
            has_next = i + 1 < len(lines) and lines[i + 1].strip()
            if has_next and not is_chord_line(lines[i + 1].strip()):
                # Unstripped lines are passed on so column positions line up
                current_verse.append(merge_chord_lyric(line, lines[i + 1]))
                i += 2
            else:
                # Chord line without a lyric line below it - skip
                i += 1
            continue

        current_verse.append(stripped)
        i += 1

    if current_verse:
        verses.append(current_verse)
    return verses


# ---------------------------------------------------------------------------
# Song formatting
# ---------------------------------------------------------------------------

def escape_property_value(val: str) -> str:
    """Escape special LaTeX characters in song property values."""
    return escape_latex(val)


def _escape_lyric_line(line: str) -> str:
    """Escape a merged lyric line while leaving \\chord{...} macros untouched."""
    parts = _CHORD_MACRO_SPLIT_RE.split(line)
    # With one capturing group, odd indices hold the chord macros
    return "".join(
        part if idx % 2 else escape_latex(part)
        for idx, part in enumerate(parts)
    )


def format_song_tex(song: Song) -> str:
    """Format a Song object as a .tex file string.

    The result is a leadsheets ``song`` environment with the song properties,
    the notes as ``%`` comment lines, and one ``verse`` environment per verse.
    """
    props = [f" title = {{{escape_property_value(song.title)}}},"]
    if song.lyrics_author:
        val = escape_property_value(song.lyrics_author.strip())
        props.append(f" lyrics = {{{val}}},")
    if song.composer:
        val = escape_property_value(song.composer.strip())
        props.append(f" composer = {{{val}}},")

    refs = song.refs
    if refs.bulibu:
        props.append(f" bulibu = {refs.bulibu},")
    if refs.bulibull:
        props.append(f" bulibull = {refs.bulibull},")
    if refs.cl:
        props.append(f" cl = {refs.cl},")
    if refs.swa:
        props.append(f" swa = {refs.swa},")
    if refs.barde:
        props.append(f" barde = {refs.barde},")
    if refs.libock:
        props.append(f" libock = {refs.libock},")

    header = "\\begin{song}{\n" + "\n".join(props) + "\n}\n"

    body_parts = []
    for verse in process_song_lines(song.raw_lines):
        if not verse:
            continue
        verse_text = " \\\\\n".join(_escape_lyric_line(line) for line in verse)
        body_parts.append(f"\\begin{{verse}}\n{verse_text}\n\\end{{verse}}")
    body = "\n\n".join(body_parts)

    note_section = ""
    if song.notes:
        note_lines = "\n".join(song.notes).split("\n")
        note_section = "\n" + "\n".join(f"% {l}" for l in note_lines) + "\n"

    return header + note_section + "\n" + body + "\n\n\\end{song}\n"


# ---------------------------------------------------------------------------
# Main parsing logic
# ---------------------------------------------------------------------------

def read_input(path: str) -> list:
    """Read the input file and return lines."""
    with open(path, 'r', encoding='utf-8') as f:
        return f.readlines()


def find_song_start(lines: list) -> int:
    """Find the line index where song footers begin.

    Song pages use 'BuLiBu BuLiBuII CL SwA Barde LiBock'.
    TOC pages use a different layout with BuLiBu appearing separately.

    Returns 0 when no footer header is found.
    """
    for i, line in enumerate(lines):
        if is_bulibu_header(line):
            return i
    return 0


def has_metadata_before(lines: list, footer_idx: int, search_floor: int) -> bool:
    """Check if there's a Worte/Weise line between search_floor and footer_idx.

    At most the 40 lines directly before the footer are inspected.
    """
    search_start = max(search_floor, footer_idx - 40)
    for j in range(search_start, footer_idx):
        line = lines[j].strip()
        if (WORTE_UND_WEISE_RE.match(line) or WORTE_RE.match(line)
                or WEISE_RE.match(line)):
            return True
    return False


def is_song_end_footer(lines: list, footer_idx: int, search_floor: int) -> bool:
    """Determine if a BuLiBu footer marks the end of a song.

    A footer ends a song if Worte/Weise metadata appears before it.
    Some songs don't have Worte/Weise at all, which we handle in the
    splitting logic.
    """
    return has_metadata_before(lines, footer_idx, search_floor)


def looks_like_new_song_after(lines: list, after_start: int) -> bool:
    """Check if the content after a footer looks like a new song starts.

    A new song typically has: a title line (short text) followed by a chord line.
    But sometimes there's a "Ref.:" or "Ref:" label between title and chords.
    A continuation has: lyrics continuing without a title+chord pattern.

    Up to 12 lines are scanned and at most the first 5 non-empty,
    non-footer lines among them are considered.
    """
    if after_start >= len(lines):
        return False

    peek_lines = []
    for k in range(after_start, min(after_start + 12, len(lines))):
        s = lines[k].replace('\f', '').strip()
        if s and not is_bulibu_header(s):
            peek_lines.append(s)
            if len(peek_lines) >= 5:
                break

    if len(peek_lines) < 2:
        return False

    first = peek_lines[0]
    # Check if first line is a potential title (short text, not chords, not numbers-only)
    if is_chord_line(first) or len(first) >= 80:
        return False
    if re.match(r'^\d+(\s+\d+)*$', first.strip()):
        return False

    # Check if a chord line appears within the next few lines
    return any(is_chord_line(pl) for pl in peek_lines[1:4])


def split_into_raw_songs(lines: list, start_idx: int) -> list:
    """Split the lines from start_idx onward into raw song blocks.

    Strategy:
    1. Find all BuLiBu footer positions.
    2. If a title + chord pattern appears in the 100 lines before the first
       footer, the text from there up to the first footer is a song of its own.
    3. Every later footer is a song boundary if the content after it looks
       like a new song (title + chords), or if it is the last footer.
       Otherwise it is a mid-song page break and the content keeps
       accumulating into the current song.
    4. Whatever follows the last footer becomes a final block without
       reference numbers.

    Returns a list of dicts with keys 'lines' and 'refs'.
    """
    footer_positions = [
        i for i in range(start_idx, len(lines)) if is_bulibu_header(lines[i])
    ]
    if not footer_positions:
        return []

    song_blocks = []
    current_block_start = None
    current_block_lines = []

    # Check for a song BEFORE the first footer (between TOC and first song footer).
    # The first song in the book may appear before any BuLiBu footer.
    first_footer = footer_positions[0]
    pre_footer_song_start = None
    for i in range(max(0, first_footer - 100), first_footer - 1):
        line = lines[i].replace('\f', '').strip()
        next_line = lines[i + 1].replace('\f', '').strip()
        # Look for: short text line followed by a chord line
        if (line and not is_chord_line(line) and len(line) < 80
                and next_line and is_chord_line(next_line)):
            pre_footer_song_start = i
            break

    if pre_footer_song_start is not None:
        song_blocks.append({
            'lines': lines[pre_footer_song_start:first_footer],
            'refs': parse_reference_numbers(
                lines[first_footer],
                lines[first_footer + 1] if first_footer + 1 < len(lines) else "",
            ),
        })

    last_fi = len(footer_positions) - 1
    for fi, footer_idx in enumerate(footer_positions):
        numbers_idx = footer_idx + 1
        if numbers_idx >= len(lines):
            break

        # For the first footer, it's the transition from TOC to songs
        if fi == 0:
            current_block_start = numbers_idx + 1
            continue

        if current_block_start is not None:
            # Collect lines from current_block_start to this footer
            current_block_lines.extend(lines[current_block_start:footer_idx])

            # The last footer always ends the song; any other footer only
            # does so when a new song follows it.
            if fi == last_fi or looks_like_new_song_after(lines, numbers_idx + 1):
                song_blocks.append({
                    'lines': current_block_lines,
                    'refs': parse_reference_numbers(lines[footer_idx], lines[numbers_idx]),
                })
                current_block_lines = []

        current_block_start = numbers_idx + 1

    # Handle remaining content
    if current_block_start is not None and current_block_start < len(lines):
        current_block_lines.extend(lines[current_block_start:])
    if current_block_lines:
        song_blocks.append({
            'lines': current_block_lines,
            'refs': SongReferences(),
        })

    return song_blocks


def _is_metadata_stop(text: str) -> bool:
    """Return True if *text* starts a Worte/Weise entry or is a footer header."""
    return bool(WORTE_RE.match(text) or WEISE_RE.match(text) or is_bulibu_header(text))


def _collect_metadata_value(block_lines: list, idx: int, first_value: str) -> str:
    """Join the value on line *idx* with its continuation lines below it."""
    parts = [first_value.strip()]
    for raw in block_lines[idx + 1:]:
        text = raw.strip()
        if not text or _is_metadata_stop(text):
            break
        parts.append(text)
    return " ".join(parts)


def _metadata_block_end(block_lines: list, idx: int, stop_on_markers: bool) -> int:
    """Return the index just past the metadata entry that starts on line *idx*.

    The entry ends at the first blank line and, if *stop_on_markers* is set,
    also at the next Worte/Weise entry or footer header.
    """
    j = idx + 1
    while j < len(block_lines):
        text = block_lines[j].strip()
        if not text or (stop_on_markers and _is_metadata_stop(text)):
            break
        j += 1
    return j


def extract_metadata_and_notes(block_lines: list):
    """Extract Worte, Weise, and commentary notes from the end of block lines.

    Only the last 50 lines of the block are searched for metadata. The
    metadata lines are removed from the content; lines after the metadata
    are kept as content.

    Returns (content_lines, lyrics_author, composer, notes).
    """
    lyrics_author = None
    composer = None
    notes_lines = []

    # Find Worte/Weise lines working backwards
    worte_idx = None
    weise_idx = None
    worte_und_weise_idx = None
    metadata_start = len(block_lines)

    for i in range(len(block_lines) - 1, max(-1, len(block_lines) - 50), -1):
        line = block_lines[i].strip()
        if not line:
            continue

        m = WORTE_UND_WEISE_RE.match(line)
        if m:
            worte_und_weise_idx = i
            lyrics_author = composer = _collect_metadata_value(block_lines, i, m.group(1))
            metadata_start = i
            break

        mw = WORTE_RE.match(line)
        if mw:
            # Only the Worte line closest to the end of the block counts
            if worte_idx is None:
                worte_idx = i
                lyrics_author = _collect_metadata_value(block_lines, i, mw.group(1))
                if weise_idx is not None:
                    metadata_start = min(worte_idx, weise_idx)
                    break
                metadata_start = worte_idx
            continue

        mws = WEISE_RE.match(line)
        if mws and weise_idx is None:
            weise_idx = i
            composer = _collect_metadata_value(block_lines, i, mws.group(1))
            if worte_idx is not None:
                metadata_start = min(worte_idx, weise_idx)
                break
            metadata_start = weise_idx

    # Determine final metadata_start and metadata_end
    metadata_end = metadata_start
    if worte_und_weise_idx is not None:
        metadata_start = worte_und_weise_idx
        metadata_end = _metadata_block_end(block_lines, worte_und_weise_idx, True)
    elif worte_idx is not None and weise_idx is not None:
        metadata_start = min(worte_idx, weise_idx)
        # Include continuation lines after the later one
        metadata_end = _metadata_block_end(block_lines, max(worte_idx, weise_idx), True)
    elif worte_idx is not None:
        metadata_start = worte_idx
        metadata_end = _metadata_block_end(block_lines, worte_idx, False)
    elif weise_idx is not None:
        metadata_start = weise_idx
        metadata_end = _metadata_block_end(block_lines, weise_idx, False)

    # Content is everything EXCEPT the metadata lines.
    # If there's content AFTER the metadata, include it too.
    content_lines = block_lines[:metadata_start]
    content_after = block_lines[metadata_end:]
    if any(l.strip() for l in content_after):
        content_lines = content_lines + content_after

    # Look for commentary/notes between song content and metadata.
    # Notes are prose paragraphs separated by blank lines from song content.
    # Heuristic: take the paragraph after the last blank line and treat it as
    # prose if its lines are long on average and contain a period.
    last_content = len(content_lines) - 1
    while last_content >= 0 and not content_lines[last_content].strip():
        last_content -= 1

    # NOTE(review): the original indentation of this loop was lost; it is
    # reconstructed as "stop at the first blank-line boundary" - confirm.
    for j in range(last_content, 0, -1):
        if content_lines[j].strip():
            continue
        candidate = [l.strip() for l in content_lines[j + 1:last_content + 1] if l.strip()]
        if candidate:
            avg_len = sum(len(l) for l in candidate) / len(candidate)
            has_period = any('.' in l for l in candidate)
            if avg_len > 60 and has_period:
                notes_lines = candidate
                content_lines = content_lines[:j]
        break

    return content_lines, lyrics_author, composer, notes_lines


def extract_title(content_lines: list) -> tuple:
    """Extract the song title from the beginning of content lines.

    The title appears on line(s) before the first chord line.
    Often the title is the same text as (or a prefix of) the first lyric line.

    Returns (title, remaining_lines). The title is empty when the content
    starts with a chord line or contains no text at all.
    """
    # Skip empty lines and form feeds at the start
    start = 0
    while start < len(content_lines):
        if content_lines[start].replace('\f', '').strip():
            break
        start += 1
    if start >= len(content_lines):
        return "", content_lines

    # Strategy: collect title lines until we hit a chord line or verse label.
    # The line BEFORE the first chord line is the last title line.
    title_lines = []
    for i in range(start, len(content_lines)):
        line = content_lines[i].replace('\f', '').strip()

        if not line:
            if title_lines:
                return " ".join(title_lines), content_lines[i:]
            continue

        if is_chord_line(line):
            if title_lines:
                return " ".join(title_lines), content_lines[i:]
            return "", content_lines[start:]

        # If this line is a verse label (Ref.:, etc.), stop title collection
        if VERSE_LABEL_RE.match(line):
            if title_lines:
                return " ".join(title_lines), content_lines[i:]
            # Verse label as first line - unusual; skip it
            continue

        # Check if the next non-empty, non-label line is a chord line
        next_chord = False
        for j in range(i + 1, min(i + 4, len(content_lines))):
            nxt = content_lines[j].replace('\f', '').strip()
            if not nxt:
                continue
            if is_chord_line(nxt):
                next_chord = True
            elif VERSE_LABEL_RE.match(nxt):
                continue  # Skip verse labels in lookahead
            break

        if next_chord:
            # This line sits directly above the chords: it closes the title
            title_lines.append(line)
            return " ".join(title_lines), content_lines[i + 1:]
        if title_lines:
            # A second text line without chords below it starts the lyrics
            return " ".join(title_lines), content_lines[i:]
        title_lines.append(line)

    # Fell through - use whatever we collected
    if title_lines:
        return " ".join(title_lines), []
    return "", content_lines[start:]


def parse_song_block(block: dict) -> Optional[Song]:
    """Parse a raw song block into a Song object.

    Returns None for blocks without content or title and for blocks that are
    recognised as non-songs (numeric title, illustration page).
    """
    refs = block['refs']

    cleaned_lines = [l.rstrip('\n').replace('\f', '') for l in block['lines']]
    while cleaned_lines and not cleaned_lines[0].strip():
        cleaned_lines.pop(0)
    while cleaned_lines and not cleaned_lines[-1].strip():
        cleaned_lines.pop()
    if not cleaned_lines:
        return None

    content_lines, lyrics_author, composer, notes = extract_metadata_and_notes(cleaned_lines)
    if not content_lines:
        # Song might be metadata-only (e.g., Tischlieder, Trinksprüche)
        return None

    while content_lines and not content_lines[-1].strip():
        content_lines.pop()
    while content_lines and not content_lines[0].strip():
        content_lines.pop(0)

    title, song_lines = extract_title(content_lines)

    if not title:
        # Try to derive title from first lyric line
        # (Some songs start directly with chords)
        for l in song_lines:
            s = l.replace('\f', '').strip()
            if s and not is_chord_line(s):
                title = s
                # Cut long lines at the last word boundary within 60 characters
                if len(title) > 60:
                    title = title[:60].rsplit(' ', 1)[0]
                break

    if not title:
        return None

    title = re.sub(r'\s+', ' ', title).strip()

    # Filter out non-song blocks
    # - Title is just a number (page number artifact)
    # - Title mentions "Illustration" (illustration page)
    if re.match(r'^\d+$', title):
        return None
    if 'Illustration' in title:
        return None

    return Song(
        title=title,
        lyrics_author=lyrics_author,
        composer=composer,
        notes=notes,
        raw_lines=song_lines,
        refs=refs,
    )


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main():
    """Parse the extracted text, write one .tex file per song and the song index."""
    parser = argparse.ArgumentParser(description="Import songs from Carmina Leonis text extract")
    parser.add_argument("--input", default=INPUT_FILE, help="Path to extracted text file")
    parser.add_argument("--output-dir", default=OUTPUT_DIR, help="Output directory for .tex files")
    parser.add_argument("--all-songs", default=ALL_SONGS_FILE, help="Path for all-songs.tex")
    parser.add_argument("--dry-run", action="store_true", help="Parse but don't write files")
    args = parser.parse_args()

    print(f"Reading input from: {args.input}")
    lines = read_input(args.input)
    print(f"Read {len(lines)} lines")

    song_start = find_song_start(lines)
    print(f"Songs start at line {song_start + 1}")

    raw_blocks = split_into_raw_songs(lines, song_start)
    print(f"Found {len(raw_blocks)} raw song blocks")

    songs = []
    skipped = 0
    for block in raw_blocks:
        song = parse_song_block(block)
        if song:
            songs.append(song)
        else:
            skipped += 1
    print(f"Parsed {len(songs)} songs ({skipped} blocks skipped)")

    if args.dry_run:
        for s in songs:
            cl = s.refs.cl or "?"
            print(f" CL {cl:>4s}: {s.title}")
        return

    os.makedirs(args.output_dir, exist_ok=True)

    written = 0
    skipped_existing = 0
    filenames = []
    used_filenames = set()

    for song in songs:
        filename = sanitize_filename(song.title)
        if not filename:
            print(f" WARNING: Could not generate filename for '{song.title}', skipping")
            continue

        # Disambiguate songs whose titles sanitize to the same filename
        base_filename = filename
        counter = 2
        while filename + ".tex" in used_filenames:
            filename = f"{base_filename}-{counter}"
            counter += 1
        tex_filename = filename + ".tex"
        used_filenames.add(tex_filename)

        if tex_filename in EXISTING_SONGS:
            # Hand-crafted file: keep it, but still list it in all-songs.tex
            print(f" SKIP (existing): {tex_filename}")
            skipped_existing += 1
            filenames.append((filename, song.title))
            continue

        filepath = os.path.join(args.output_dir, tex_filename)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(format_song_tex(song))
        filenames.append((filename, song.title))
        written += 1

    print(f"\nWrote {written} song files ({skipped_existing} existing songs preserved)")

    filenames.sort(key=lambda x: x[1].lower())
    with open(args.all_songs, 'w', encoding='utf-8') as f:
        f.write("% Auto-generated list of all songs (alphabetical order)\n")
        f.write("% Generated by import-songs.py\n\n")
        for fname, title in filenames:
            f.write(f"\\input{{songs/{fname}}}\n")
    print(f"Generated {args.all_songs} with {len(filenames)} entries")

    with_chords = sum(
        1 for s in songs
        if any(is_chord_line(l.strip()) for l in s.raw_lines if l.strip())
    )
    with_worte = sum(1 for s in songs if s.lyrics_author)
    with_weise = sum(1 for s in songs if s.composer)
    with_cl = sum(1 for s in songs if s.refs.cl)

    print(f"\nStatistics:")
    print(f" Total songs: {len(songs)}")
    print(f" With chords: {with_chords}")
    print(f" With lyrics author (Worte): {with_worte}")
    print(f" With composer (Weise): {with_weise}")
    print(f" With CL number: {with_cl}")


if __name__ == "__main__":
    main()