#!/usr/bin/env python3
"""
Parses the raw JSONL conversation transcript into a structured timeline.
Produces parsed.json with chapter metadata, session boundaries, and clean message content.

Note: Expects raw.jsonl in the same directory. This file is not included in the
publication for privacy reasons (it contains local file paths and session metadata).
To regenerate, copy your Claude Code session transcript here as raw.jsonl first.

Usage: python3 parser.py
"""

import json
import re
from datetime import datetime, timedelta
from pathlib import Path

# Input transcript and generated output both live in this script's directory.
RAW_PATH = Path(__file__).parent / "raw.jsonl"
OUTPUT_PATH = Path(__file__).parent / "parsed.json"

# Phase keywords to auto-detect conversation phases.
# Maps a phase id to the keywords that count as evidence for that phase.
# detect_phase() lower-cases both the keyword and the message and tests for a
# plain substring, so the casing used here is cosmetic. When two phases score
# the same number of hits, the phase listed first in this dict wins.
PHASE_MARKERS = {
    "discovery": [
        "problem understanding", "feature scoping", "flow mapping",
        "design brief", "personas", "MoSCoW", "must have", "should have",
        "pain point", "core problem", "who experiences",
    ],
    "information-architecture": [
        "sitemap", "wireframe", "screen", "responsive strategy",
        "information architecture", "reflow", "divergent", "navigation",
    ],
    "style-direction": [
        "style direction", "token", "wayfinder", "cartographer", "basecamp",
        "signal", "color palette", "typography", "Satoshi", "design tokens",
    ],
    "component-generation": [
        "component", "variant", "preview", "navbar", "hero section",
        "how it works", "expedition card", "progress bar", "filter bar",
        "promote_component", "save_component",
    ],
    "page-assembly": [
        "page assembly", "LandingPage", "ExpeditionPage", "ArchivePage",
        "save_page", "complete_assembly", "stage 5",
    ],
    "self-evaluation": [
        "self-evaluat", "self evaluat", "checklist", "audit",
        "overflow", "touch target", "completeness check",
    ],
}


def parse_content(message_data):
    """Pull the displayable content out of one transcript record.

    The payload is ``message_data["message"]`` when that key exists, otherwise
    the record itself.

    Returns:
        * the payload unchanged when it is already a string;
        * the ``content`` string when the payload is a dict with string content;
        * ``{"text": ..., "tools_used": [...]}`` when ``content`` is a list of
          blocks: ``text`` blocks are joined with blank lines, ``tool_use``
          blocks contribute their ``name`` (``"unknown"`` if absent), and
          every other block, including ``tool_result``, is ignored;
        * ``""`` for any other shape.
    """
    payload = message_data.get("message", message_data)
    if isinstance(payload, str):
        return payload
    if not isinstance(payload, dict):
        return ""

    body = payload.get("content", "")
    if isinstance(body, str):
        return body
    if not isinstance(body, list):
        return ""

    # Only dict-shaped blocks carry a "type"; anything else is skipped.
    dict_blocks = [blk for blk in body if isinstance(blk, dict)]
    text_parts = [
        blk.get("text", "") for blk in dict_blocks if blk.get("type") == "text"
    ]
    tool_names = [
        blk.get("name", "unknown")
        for blk in dict_blocks
        if blk.get("type") == "tool_use"
    ]
    return {
        "text": "\n\n".join(text_parts),
        "tools_used": tool_names,
    }


def is_meta_message(content_str):
    """Return True when the text is empty or contains a known noise marker.

    Empty or otherwise falsy content is always treated as noise. Non-empty
    content is noise when any of the markers below appears anywhere in it.
    """
    if not content_str:
        return True

    noise_markers = (
        "<local-command-caveat>",
        "<command-name>",
        "<local-command-stdout>",
        "API Error:",
        "Login interrupted",
    )
    for marker in noise_markers:
        if marker in content_str:
            return True
    return False


def detect_phase(text, index, total):
    """Assign a message to a conversation phase.

    Keyword evidence comes first: each phase in PHASE_MARKERS scores one point
    per keyword found, as a case-insensitive substring, in ``text``. The
    highest-scoring phase wins, and ties go to the phase listed first. When no
    keyword matches at all, the phase is picked from the message's relative
    position ``index / total`` in the conversation.
    """
    haystack = text.lower() if text else ""
    position = index / max(total, 1)

    hits_by_phase = {
        phase: sum(kw.lower() in haystack for kw in keywords)
        for phase, keywords in PHASE_MARKERS.items()
    }
    winner = max(hits_by_phase, key=hits_by_phase.get)
    if hits_by_phase[winner] > 0:
        return winner

    # No keyword evidence: use the message's position in the conversation.
    # Each entry is (exclusive upper bound on position, phase).
    position_cutoffs = (
        (0.15, "discovery"),
        (0.30, "information-architecture"),
        (0.45, "style-direction"),
        (0.75, "component-generation"),
        (0.90, "page-assembly"),
    )
    for upper_bound, phase in position_cutoffs:
        if position < upper_bound:
            return phase
    return "self-evaluation"


# Phrases that mark a message as containing a key decision.
_DECISION_INDICATORS = (
    "approved", "lock it in", "let's go with", "I choose",
    "I like", "I'm leaning towards", "that's the one",
    "yes", "let's do", "ship it", "ready to",
    "BRIEF", "FEATURES", "FLOWS",  # artifact presentations
)

# Case-insensitive, and anchored on word boundaries so that short indicators
# do not fire inside unrelated words ("yes" in "yesterday" or "eyes",
# "ready to" in "already told", "BRIEF" in "briefly").
_DECISION_PATTERN = re.compile(
    r"\b(?:" + "|".join(re.escape(p) for p in _DECISION_INDICATORS) + r")\b",
    re.IGNORECASE,
)


def detect_decision_point(text):
    """Check if a message contains a key decision.

    Args:
        text: Message text; may be empty or None.

    Returns:
        True when any indicator phrase occurs in ``text`` as a whole word or
        phrase, ignoring case. False otherwise, including for empty input.
        Inflected forms such as "I liked" do not match.
    """
    if not text:
        return False
    return _DECISION_PATTERN.search(text) is not None


def detect_sessions(messages):
    """Stamp every message with a 1-based ``session`` number, in place.

    A new session starts whenever a message's timestamp is more than 30
    minutes after the previous usable timestamp. Messages whose timestamp is
    missing, empty, unparseable, or not comparable with the previous one keep
    the current session number and do not move the reference point.

    Returns the same list object that was passed in.
    """
    if not messages:
        return messages

    max_gap = timedelta(minutes=30)
    current_session = 1
    previous_stamp = None

    for entry in messages:
        raw_stamp = entry.get("timestamp", "")
        if raw_stamp:
            try:
                # A trailing "Z" is rewritten as an explicit UTC offset
                # before the string is handed to fromisoformat().
                stamp = datetime.fromisoformat(raw_stamp.replace("Z", "+00:00"))
                gap_exceeded = (
                    previous_stamp is not None
                    and stamp - previous_stamp > max_gap
                )
            except (ValueError, TypeError):
                # Bad format, or a naive/aware mix: ignore this timestamp.
                pass
            else:
                if gap_exceeded:
                    current_session += 1
                previous_stamp = stamp
        entry["session"] = current_session

    return messages


def build_chapters(messages):
    """Summarise the parsed messages as an ordered list of chapters.

    Messages are grouped by their ``phase``. One chapter is emitted per known
    phase that has at least one message, always in the fixed order of the
    catalogue below. Messages whose phase is missing or not in the catalogue
    do not appear in any chapter.
    """
    # (phase id, chapter title, chapter subtitle), in presentation order.
    catalogue = (
        ("discovery", "Discovery", "Cold start to design brief — problem understanding, persona definition, feature prioritization, flow mapping"),
        ("information-architecture", "Information Architecture", "Sitemap, screen extraction, wireframes, responsive strategy"),
        ("style-direction", "Style Direction", "Token exploration, candidate comparison, Wayfinder Evolved selection"),
        ("component-generation", "Component Iteration", "19 components across 3 pages — variant generation, review, and approval"),
        ("page-assembly", "Page Assembly", "Composing components into full pages at desktop and mobile breakpoints"),
        ("self-evaluation", "The Self-Evaluation Gap", "The meta-lesson: good process for decisions doesn't guarantee good output"),
    )

    by_phase = {}
    for entry in messages:
        by_phase.setdefault(entry.get("phase", "unknown"), []).append(entry)

    result = []
    for phase_id, title, subtitle in catalogue:
        members = by_phase.get(phase_id)
        if not members:
            continue

        decision_total = sum(1 for m in members if m.get("is_decision_point"))
        session_ids = sorted({m.get("session", 0) for m in members})

        summary = {
            "id": phase_id,
            "title": title,
            "subtitle": subtitle,
            "message_count": len(members),
            "first_index": members[0]["index"],
            "last_index": members[-1]["index"],
            "sessions": session_ids,
            "decision_count": decision_total,
        }
        result.append(summary)

    return result


def main():
    """Parse raw.jsonl into parsed.json and print a short summary.

    Reads RAW_PATH line by line as UTF-8 JSONL and keeps only ``user`` and
    ``assistant`` records that carry displayable content. Each kept message
    is then annotated with a phase and a session number, chapter metadata is
    built, and the result is written to OUTPUT_PATH as indented JSON.

    Raises:
        FileNotFoundError: if RAW_PATH does not exist.
    """
    try:
        # Read as UTF-8 explicitly rather than relying on the platform's
        # locale encoding.
        raw_file = RAW_PATH.open(encoding="utf-8")
    except FileNotFoundError as err:
        raise FileNotFoundError(
            f"{RAW_PATH} not found: copy your Claude Code session transcript "
            "there as raw.jsonl before running the parser"
        ) from err

    # Compiled once; used to drop injected system reminders from the text.
    system_reminder = re.compile(
        r"<system-reminder>.*?</system-reminder>", flags=re.DOTALL
    )

    messages = []
    with raw_file:
        for line in raw_file:
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                # Blank or malformed lines are skipped.
                continue

            # A line can be valid JSON without being an object (for example
            # a bare list or number); such lines carry no message.
            if not isinstance(data, dict):
                continue

            msg_type = data.get("type")
            if msg_type not in ("user", "assistant"):
                continue

            parsed = parse_content(data)

            if isinstance(parsed, dict):
                text = parsed.get("text", "")
                tools = parsed.get("tools_used", [])
            else:
                text = parsed
                tools = []

            # NOTE: is_meta_message() treats empty text as noise, so records
            # that contain only tool calls are dropped at this point.
            if is_meta_message(text):
                continue

            # Strip system-reminder tags from display content
            text = system_reminder.sub("", text).strip()

            # Text can become empty only through the stripping above; keep
            # the message in that case if it still lists tools.
            if not text and not tools:
                continue

            messages.append({
                "index": len(messages),
                "type": msg_type,
                "timestamp": data.get("timestamp", ""),
                "content": text,
                "tools_used": tools if msg_type == "assistant" else [],
                "is_decision_point": detect_decision_point(text) if msg_type == "user" else False,
            })

    # Detect phases
    total = len(messages)
    for msg in messages:
        msg["phase"] = detect_phase(msg["content"], msg["index"], total)

    # Detect sessions
    messages = detect_sessions(messages)

    # Build chapters
    chapters = build_chapters(messages)

    # Summary stats
    user_count = sum(1 for m in messages if m["type"] == "user")
    assistant_count = sum(1 for m in messages if m["type"] == "assistant")
    session_count = max((m.get("session", 1) for m in messages), default=1)

    output = {
        "meta": {
            "project": "Unreal Expeditions",
            "description": "Full design process conversation: discovery through page assembly",
            "total_messages": len(messages),
            "user_messages": user_count,
            "assistant_messages": assistant_count,
            "sessions": session_count,
            "first_timestamp": messages[0]["timestamp"] if messages else "",
            "last_timestamp": messages[-1]["timestamp"] if messages else "",
        },
        "chapters": chapters,
        "messages": messages,
    }

    OUTPUT_PATH.write_text(json.dumps(output, indent=2), encoding="utf-8")
    print(f"Parsed {len(messages)} messages into {len(chapters)} chapters across {session_count} sessions")
    print(f"Output: {OUTPUT_PATH}")


# Run the parser only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
