"""
extract_notes.py  –  Extract per-slide notes from a PPTX and write to CSV.

Usage:
    python3 extract_notes.py <pptx_path> [output.csv]

If output.csv is omitted, the CSV is written next to the PPTX with the
same base name (e.g. my_deck.pptx → my_deck_notes.csv).

CSV columns:
    Slide       – label such as "Slide 1", "Slide 2", …
    Transcript  – notes-pane text for that slide, cleaned to single-line
                  ASCII for TTS (empty if none)
"""

import csv
import os
import re
import sys
import unicodedata
from pptx import Presentation


# ---------------------------------------------------------------------------
# Characters / ranges to map to a plain ASCII equivalent for TTS
# ---------------------------------------------------------------------------
_CHAR_MAP = {
    # Dashes / hyphen variants → ASCII hyphen.  Without these entries the
    # ASCII encode step would drop them and fuse the neighbouring words.
    "\u2010": "-",   # hyphen
    "\u2011": "-",   # non-breaking hyphen
    "\u2012": "-",   # figure dash
    "\u2013": "-",   # en dash
    "\u2014": "-",   # em dash
    "\u2015": "-",   # horizontal bar
    "\u2212": "-",   # minus sign
    # Ellipsis
    "\u2026": "...", # …
    # Smart / curly quotes → straight quotes
    "\u2018": "'",   # left single
    "\u2019": "'",   # right single / apostrophe
    "\u201a": "'",   # single low-9
    "\u201b": "'",   # single high-reversed-9
    "\u201c": '"',   # left double
    "\u201d": '"',   # right double
    "\u201e": '"',   # double low-9
    "\u201f": '"',   # double high-reversed-9
    # Bullets / list markers → space (TTS will just pause)
    "\u2022": " ",   # bullet •
    "\u2023": " ",   # triangular bullet
    "\u2043": " ",   # hyphen bullet
    "\u25aa": " ",   # small black square ▪
    "\u25b8": " ",   # small right-pointing triangle ▸
    "\u25cf": " ",   # black circle ●
    "\u00b7": " ",   # middle dot ·
    # Non-breaking space → regular space
    "\u00a0": " ",
    "\u202f": " ",   # narrow no-break space
    "\u2009": " ",   # thin space
    # Miscellaneous symbols that add no speech value
    "\u2122": "",    # ™
    "\u00ae": "",    # ®
    "\u00a9": "",    # ©
}

# One-pass translation table derived from _CHAR_MAP (built once at import).
_CHAR_TABLE = str.maketrans(_CHAR_MAP)

# Control characters that are deleted outright.  Tab, newline, vertical tab,
# form feed and carriage return (\x09-\x0d) are deliberately NOT listed: they
# are whitespace and must become a space, otherwise the words on either side
# get glued together.  Vertical tab matters in particular because python-pptx
# reports in-paragraph line breaks as "\v".
_CONTROL_CHARS_RE = re.compile(r"[\x00-\x08\x0e-\x1f\x7f]")
_WHITESPACE_RE = re.compile(r"\s+")
_SPACE_BEFORE_PUNCT_RE = re.compile(r"\s+([,.:;!?])")


def clean_transcript(text: str) -> str:
    """
    Clean raw PPTX notes text for TTS / audio conversion.

    Steps, in order:
      1. Replace known typographic characters (dashes, curly quotes,
         bullets, special spaces, ™/®/©) using ``_CHAR_MAP``.
      2. NFKD-normalize and drop everything that is still non-ASCII
         (e.g. é → e; characters with no ASCII decomposition are removed).
      3. Delete non-whitespace control characters.
      4. Collapse every whitespace run (newlines, tabs, vertical tabs,
         form feeds, runs of spaces) into a single space.
      5. Remove whitespace that precedes , . : ; ! ?
      6. Strip leading / trailing whitespace.

    Returns '' for empty or None input.
    """
    if not text:
        return ""

    # 1. Map known special characters in a single pass.
    text = text.translate(_CHAR_TABLE)

    # 2. Decompose accented / compatibility characters to their ASCII base
    #    where possible, then drop whatever cannot be represented in ASCII.
    text = unicodedata.normalize("NFKD", text)
    text = text.encode("ascii", errors="ignore").decode("ascii")

    # 3. Remove control characters that are not whitespace.
    text = _CONTROL_CHARS_RE.sub("", text)

    # 4. Collapse all whitespace → one space.
    text = _WHITESPACE_RE.sub(" ", text)

    # 5. Tidy up punctuation spacing: remove space before , . : ; ! ?
    text = _SPACE_BEFORE_PUNCT_RE.sub(r"\1", text)

    return text.strip()


def extract_slide_notes(slide):
    """Return the stripped notes-pane text for *slide*, or '' if none.

    The slide is only read: ``has_notes_slide`` is checked before touching
    ``notes_slide``, because python-pptx creates a new notes slide when that
    property is read on a slide that has none.

    Extraction is best-effort. If reading the notes raises, a warning is
    written to stderr and '' is returned so one bad slide does not abort the
    whole export.
    """
    try:
        if not slide.has_notes_slide:
            return ""
        text_frame = slide.notes_slide.notes_text_frame
        # A notes slide without a notes placeholder has no text frame.
        if text_frame is None:
            return ""
        return text_frame.text.strip()
    except Exception as exc:
        sys.stderr.write(f"Warning: could not read slide notes: {exc}\n")
        return ""


def extract_notes_to_csv(pptx_path, csv_path):
    """Export the cleaned notes of every slide in a PPTX to a CSV file.

    Opens *pptx_path*, reads each slide's notes via ``extract_slide_notes``,
    cleans them with ``clean_transcript`` and writes one row per slide to
    *csv_path* with the columns ``Slide`` and ``Transcript``.  Slides are
    labelled "Slide 1", "Slide 2", … in presentation order.

    Returns the list of row dicts that was written.
    """
    columns = ["Slide", "Transcript"]
    deck = Presentation(pptx_path)

    records = [
        {
            "Slide": f"Slide {number}",
            "Transcript": clean_transcript(extract_slide_notes(slide)),
        }
        for number, slide in enumerate(deck.slides, start=1)
    ]

    # utf-8-sig writes a BOM at the start of the file; newline="" lets the
    # csv module control line endings itself.
    with open(csv_path, "w", newline="", encoding="utf-8-sig") as out_file:
        sheet = csv.DictWriter(out_file, fieldnames=columns)
        sheet.writeheader()
        for record in records:
            sheet.writerow(record)

    return records


def main():
    """Command-line entry point.

    Reads ``<pptx_path> [output.csv]`` from ``sys.argv``, exports the notes
    and prints a per-slide summary ("Y" = slide has notes, "-" = empty).
    When the output path is omitted, the CSV is written next to the PPTX as
    ``<base>_notes.csv``.

    Exits with status 1 (message on stderr) when no input path is given or
    when the input path is missing or is not a regular file.
    """
    if len(sys.argv) < 2:
        sys.stderr.write(
            "Usage: python3 extract_notes.py <pptx_path> [output.csv]\n"
        )
        sys.exit(1)

    pptx_path = sys.argv[1]
    if not os.path.exists(pptx_path):
        sys.stderr.write(f"Error: file not found: {pptx_path}\n")
        sys.exit(1)
    # A directory passes the existence check but cannot be opened as a
    # presentation; reject it here instead of failing with a traceback.
    if not os.path.isfile(pptx_path):
        sys.stderr.write(f"Error: not a file: {pptx_path}\n")
        sys.exit(1)

    if len(sys.argv) >= 3:
        csv_path = sys.argv[2]
    else:
        csv_path = os.path.splitext(pptx_path)[0] + "_notes.csv"

    rows = extract_notes_to_csv(pptx_path, csv_path)

    print(f"Wrote {len(rows)} slide(s) to: {csv_path}")
    for row in rows:
        has_notes = "Y" if row["Transcript"] else "-"
        print(f"  {row['Slide']:>8}  [{has_notes}]")


if __name__ == "__main__":
    main()
