#!/usr/bin/env python3
"""
Parses the raw JSONL conversation transcript into a structured timeline.
Produces parsed.json with chapter metadata, session boundaries, and
clean message content.

Note: Expects raw.jsonl in the same directory. This file is not included
in the publication for privacy reasons (it contains local file paths and
session metadata). To regenerate, copy your Claude Code session transcript
here as raw.jsonl first.

Usage:
    python3 parser.py
"""

import json
import re
from datetime import datetime, timedelta
from pathlib import Path

RAW_PATH = Path(__file__).parent / "raw.jsonl"
OUTPUT_PATH = Path(__file__).parent / "parsed.json"

# Phase keywords to auto-detect conversation phases
PHASE_MARKERS = {
    "discovery": [
        "problem understanding", "feature scoping", "flow mapping",
        "design brief", "personas", "MoSCoW", "must have", "should have",
        "pain point", "core problem", "who experiences",
    ],
    "information-architecture": [
        "sitemap", "wireframe", "screen", "responsive strategy",
        "information architecture", "reflow", "divergent", "navigation",
    ],
    "style-direction": [
        "style direction", "token", "wayfinder", "cartographer", "basecamp",
        "signal", "color palette", "typography", "Satoshi", "design tokens",
    ],
    "component-generation": [
        "component", "variant", "preview", "navbar", "hero section",
        "how it works", "expedition card", "progress bar", "filter bar",
        "promote_component", "save_component",
    ],
    "page-assembly": [
        "page assembly", "LandingPage", "ExpeditionPage", "ArchivePage",
        "save_page", "complete_assembly", "stage 5",
    ],
    "self-evaluation": [
        "self-evaluat", "self evaluat", "checklist", "audit", "overflow",
        "touch target", "completeness check",
    ],
}

# Substrings that mark a message as system/meta noise.
# NOTE(review): the first three entries were empty strings in the file as
# received. An empty string is a substring of every message, so the filter
# rejected everything. They look like angle-bracket tag markers lost in
# extraction; the Claude Code slash-command wrappers below are a best guess.
# Confirm against a real raw.jsonl.
META_PATTERNS = (
    "<command-name>",
    "<command-message>",
    "<local-command-stdout>",
    "API Error:",
    "Login interrupted",
)

# Matches one system-reminder element, including multi-line bodies.
# NOTE(review): the pattern was r".*?" in the file as received, which matches
# (and deletes) every character. The tag name is taken from the adjacent
# "Strip system-reminder tags" comment; confirm the exact tag spelling.
SYSTEM_REMINDER_RE = re.compile(
    r"<system-reminder>.*?</system-reminder>", re.DOTALL
)

# A timestamp gap larger than this starts a new session.
SESSION_GAP = timedelta(minutes=30)

# (exclusive upper bound on progress, phase) pairs used when no keyword hits.
_POSITION_FALLBACK = (
    (0.15, "discovery"),
    (0.30, "information-architecture"),
    (0.45, "style-direction"),
    (0.75, "component-generation"),
    (0.90, "page-assembly"),
)


def parse_content(message_data):
    """Extract readable text content from a message.

    Args:
        message_data: A transcript record (dict). Its ``"message"`` value is
            used when present, otherwise the record itself.

    Returns:
        A plain string when the message (or its ``content``) is a string;
        a dict ``{"text": str, "tools_used": list}`` when ``content`` is a
        list of blocks; ``""`` for any other shape.
    """
    msg = message_data.get("message", message_data)
    if isinstance(msg, str):
        return msg
    if not isinstance(msg, dict):
        return ""

    content = msg.get("content", "")
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return ""

    texts = []
    tools = []
    for block in content:
        if not isinstance(block, dict):
            continue
        block_type = block.get("type")
        if block_type == "text":
            # ``or ""`` guards against an explicit null, which would
            # otherwise break the join below.
            texts.append(block.get("text") or "")
        elif block_type == "tool_use":
            tools.append(block.get("name", "unknown"))
        # tool_result (and any other block type) is skipped in display

    return {"text": "\n\n".join(texts), "tools_used": tools}


def is_meta_message(content_str):
    """Check if a message is system/meta noise we should filter.

    Returns True for empty content and for content containing any non-empty
    entry of ``META_PATTERNS``.
    """
    if not content_str:
        return True
    # Skip empty patterns: "" is contained in every string and would cause
    # every message to be filtered.
    return any(p in content_str for p in META_PATTERNS if p)


def detect_phase(text, index, total):
    """Detect which phase a message belongs to based on content and position.

    Each phase scores one point per keyword found (case-insensitive
    substring). The highest-scoring phase wins; on a tie the phase listed
    first in ``PHASE_MARKERS`` wins. When nothing matches, the phase is
    chosen from the message's relative position ``index / total``.
    """
    text_lower = text.lower() if text else ""

    best_phase = None
    best_score = 0
    for phase, keywords in PHASE_MARKERS.items():
        score = sum(1 for kw in keywords if kw.lower() in text_lower)
        if score > best_score:  # strict ">" keeps the earliest phase on ties
            best_phase, best_score = phase, score
    if best_phase is not None:
        return best_phase

    # Fallback to position-based assignment
    progress = index / max(total, 1)  # max() avoids division by zero
    for upper_bound, phase in _POSITION_FALLBACK:
        if progress < upper_bound:
            return phase
    return "self-evaluation"


def detect_decision_point(text):
    """Check if a message contains a key decision.

    Matching is a case-insensitive substring test, so short indicators such
    as "yes" also match inside longer words.
    """
    if not text:
        return False
    indicators = (
        "approved", "lock it in", "let's go with", "I choose",
        "I like", "I'm leaning towards", "that's the one",
        "yes", "let's do", "ship it", "ready to",
        "BRIEF", "FEATURES", "FLOWS",  # artifact presentations
    )
    lowered = text.lower()
    return any(ind.lower() in lowered for ind in indicators)


def _parse_timestamp(ts_str):
    """Return a datetime for an ISO-8601 string, or None if unusable."""
    if not isinstance(ts_str, str) or not ts_str:
        return None
    try:
        # fromisoformat() on older Pythons rejects a trailing "Z".
        return datetime.fromisoformat(ts_str.replace("Z", "+00:00"))
    except ValueError:
        return None


def detect_sessions(messages, max_gap=SESSION_GAP):
    """Group messages into sessions based on timestamp gaps > 30 min.

    Args:
        messages: List of message dicts; each gets a ``"session"`` key
            (1-based) written in place.
        max_gap: Gap between consecutive usable timestamps above which a
            new session starts. Defaults to 30 minutes.

    Returns:
        The same ``messages`` list. Messages with a missing or unparseable
        timestamp inherit the current session.
    """
    session = 1
    last_ts = None
    for msg in messages:
        ts = _parse_timestamp(msg.get("timestamp", ""))
        if ts is not None:
            try:
                new_session = last_ts is not None and (ts - last_ts) > max_gap
            except TypeError:
                # Naive and timezone-aware timestamps cannot be subtracted;
                # ignore this timestamp and keep the previous reference.
                pass
            else:
                if new_session:
                    session += 1
                last_ts = ts
        msg["session"] = session
    return messages


def build_chapters(messages):
    """Build chapter metadata from the parsed messages.

    Messages are grouped by their ``"phase"`` key. One chapter dict is
    emitted per phase that has at least one message, in the fixed order
    below; messages whose phase is not in that list are ignored.
    """
    phase_groups = {}
    for msg in messages:
        phase_groups.setdefault(msg.get("phase", "unknown"), []).append(msg)

    chapter_order = [
        ("discovery", "Discovery",
         "Cold start to design brief — problem understanding, persona definition, feature prioritization, flow mapping"),
        ("information-architecture", "Information Architecture",
         "Sitemap, screen extraction, wireframes, responsive strategy"),
        ("style-direction", "Style Direction",
         "Token exploration, candidate comparison, Wayfinder Evolved selection"),
        ("component-generation", "Component Iteration",
         "19 components across 3 pages — variant generation, review, and approval"),
        ("page-assembly", "Page Assembly",
         "Composing components into full pages at desktop and mobile breakpoints"),
        ("self-evaluation", "The Self-Evaluation Gap",
         "The meta-lesson: good process for decisions doesn't guarantee good output"),
    ]

    chapters = []
    for phase_id, title, subtitle in chapter_order:
        msgs = phase_groups.get(phase_id, [])
        if not msgs:
            continue
        decision_count = sum(1 for m in msgs if m.get("is_decision_point"))
        chapters.append({
            "id": phase_id,
            "title": title,
            "subtitle": subtitle,
            "message_count": len(msgs),
            "first_index": msgs[0]["index"],
            "last_index": msgs[-1]["index"],
            "sessions": sorted({m.get("session", 0) for m in msgs}),
            "decision_count": decision_count,
        })
    return chapters


def parse_messages(raw_lines):
    """Turn raw JSONL lines into the list of cleaned message dicts.

    Lines that are not valid JSON, are not JSON objects, or whose ``type``
    is neither ``"user"`` nor ``"assistant"`` are skipped, as are messages
    classified as meta noise by ``is_meta_message``.

    Returns:
        List of dicts with keys ``index``, ``type``, ``timestamp``,
        ``content``, ``tools_used`` and ``is_decision_point``.
    """
    messages = []
    for line in raw_lines:
        try:
            data = json.loads(line)
        except json.JSONDecodeError:
            continue
        if not isinstance(data, dict):
            # e.g. a bare list or number on a line; has no .get()
            continue

        msg_type = data.get("type")
        if msg_type not in ("user", "assistant"):
            continue

        parsed = parse_content(data)
        if isinstance(parsed, dict):
            text = parsed.get("text", "")
            tools = parsed.get("tools_used", [])
        else:
            text = parsed
            tools = []

        # NOTE(review): is_meta_message() treats empty text as meta, so
        # messages consisting only of tool calls are dropped here, before
        # the "not text and not tools" check below can keep them. Confirm
        # that this is intended.
        if is_meta_message(text):
            continue

        # Strip system-reminder tags from display content
        text = SYSTEM_REMINDER_RE.sub("", text).strip()

        if not text and not tools:
            continue

        is_user = msg_type == "user"
        messages.append({
            "index": len(messages),
            "type": msg_type,
            "timestamp": data.get("timestamp", ""),
            "content": text,
            "tools_used": [] if is_user else tools,
            "is_decision_point": detect_decision_point(text) if is_user else False,
        })
    return messages


def main():
    """Read raw.jsonl, build the timeline, and write parsed.json.

    Raises:
        FileNotFoundError: If ``RAW_PATH`` does not exist.
    """
    try:
        raw_text = RAW_PATH.read_text(encoding="utf-8")
    except FileNotFoundError as err:
        raise FileNotFoundError(
            f"{RAW_PATH} not found: copy your Claude Code session "
            "transcript there as raw.jsonl first"
        ) from err

    # Split on "\n" only: str.splitlines() would also break on separators
    # such as U+2028 that may legally appear inside a JSON string.
    messages = parse_messages(raw_text.strip().split("\n"))

    # Detect phases
    total = len(messages)
    for msg in messages:
        msg["phase"] = detect_phase(msg["content"], msg["index"], total)

    # Detect sessions
    messages = detect_sessions(messages)

    # Build chapters
    chapters = build_chapters(messages)

    # Summary stats
    user_count = sum(1 for m in messages if m["type"] == "user")
    assistant_count = sum(1 for m in messages if m["type"] == "assistant")
    session_count = max((m.get("session", 1) for m in messages), default=1)

    output = {
        "meta": {
            "project": "Unreal Expeditions",
            "description": "Full design process conversation: discovery through page assembly",
            "total_messages": len(messages),
            "user_messages": user_count,
            "assistant_messages": assistant_count,
            "sessions": session_count,
            "first_timestamp": messages[0]["timestamp"] if messages else "",
            "last_timestamp": messages[-1]["timestamp"] if messages else "",
        },
        "chapters": chapters,
        "messages": messages,
    }

    OUTPUT_PATH.write_text(json.dumps(output, indent=2), encoding="utf-8")
    print(f"Parsed {len(messages)} messages into {len(chapters)} chapters across {session_count} sessions")
    print(f"Output: {OUTPUT_PATH}")


if __name__ == "__main__":
    main()