<!-- agi-lecture-dreamer-gepa/proposal/index.html -->

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Dreamer-JEPA: Foundation World Models</title>

    <!-- React & ReactDOM -->
    <script crossorigin src="https://unpkg.com/react@18/umd/react.development.js"></script>
    <script crossorigin src="https://unpkg.com/react-dom@18/umd/react-dom.development.js"></script>

    <!-- Babel -->
    <script src="https://unpkg.com/@babel/standalone/babel.min.js"></script>

    <!-- Tailwind CSS -->
    <script src="https://cdn.tailwindcss.com"></script>

    <!-- Google Fonts -->
    <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;800&family=JetBrains+Mono:wght@400;700&display=swap" rel="stylesheet">

    <style>
        /* Page-wide typography and dark colour scheme. */
        body {
            font-family: 'Inter', sans-serif;
            background-color: #0f172a; /* Slate 900 */
            color: #e2e8f0; /* Slate 200 */
        }
        /*
         * Monospace face for small labels. Scoped under `body` so this rule
         * outranks any same-named single-class utility that ends up later in
         * the cascade.
         */
        body .font-mono {
            font-family: 'JetBrains Mono', monospace;
        }
        /* Text filled with a left-to-right cyan -> indigo gradient. */
        .gradient-text {
            background: linear-gradient(to right, #38bdf8, #818cf8);
            -webkit-background-clip: text;
            background-clip: text; /* standard property alongside the prefixed one */
            -webkit-text-fill-color: transparent;
        }
        /* Translucent, blurred surface used by the nav bar and the diagram panels. */
        .glass-panel {
            background: rgba(30, 41, 59, 0.7);
            -webkit-backdrop-filter: blur(10px); /* prefixed form for older Safari */
            backdrop-filter: blur(10px);
            border: 1px solid rgba(255, 255, 255, 0.1);
        }
        /* Custom diagram animations */
        /* NOTE(review): no element in the visible markup uses `flow-line`. */
        .flow-line {
            stroke-dasharray: 10;
            animation: dash 1s linear infinite;
        }
        /* -20 is one full dash + gap period (10 + 10), so the loop has no visible jump. */
        @keyframes dash {
            to {
                stroke-dashoffset: -20;
            }
        }
    </style>
</head>
<body>
    <div id="root"></div>

    <script type="text/babel">
        const { useState, useEffect } = React;

        // --- Icons (Inline SVGs to avoid CDN issues) ---
        /**
         * Shared <svg> shell for every inline icon: 24x24 viewBox, no fill,
         * stroke drawn in the current text colour.
         *
         * Props:
         *   children  - SVG shape elements to draw inside the shell.
         *   size      - width and height of the rendered icon (default 24).
         *   className - extra classes for the <svg> element (default "").
         *   ...rest   - any other attribute is forwarded to the <svg>, so a
         *               caller can e.g. pass aria-label / aria-hidden={false}
         *               for an icon that carries meaning on its own.
         *
         * Icons are treated as decorative by default (aria-hidden, not
         * focusable) because every usage in this file sits next to visible text.
         */
        const IconBase = ({ children, size = 24, className = "", ...svgProps }) => (
            <svg
                xmlns="http://www.w3.org/2000/svg"
                width={size}
                height={size}
                viewBox="0 0 24 24"
                fill="none"
                stroke="currentColor"
                strokeWidth="2"
                strokeLinecap="round"
                strokeLinejoin="round"
                className={className}
                aria-hidden="true"
                focusable="false"
                {...svgProps}
            >
                {children}
            </svg>
        );

        /**
         * Icon set used across the page. Each entry is a component that passes
         * its props straight through to IconBase and supplies only the shapes.
         */
        const Icons = {
            // Two mirrored lobes.
            Brain: (svgProps) => {
                return (
                    <IconBase {...svgProps}>
                        <path d="M9.5 2A2.5 2.5 0 0 1 12 4.5v15a2.5 2.5 0 0 1-4.96.44 2.5 2.5 0 0 1-2.96-3.08 3 3 0 0 1-.34-5.58 2.5 2.5 0 0 1 1.32-4.24 2.5 2.5 0 0 1 1.98-3A2.5 2.5 0 0 1 9.5 2Z" />
                        <path d="M14.5 2A2.5 2.5 0 0 0 12 4.5v15a2.5 2.5 0 0 0 4.96.44 2.5 2.5 0 0 0 2.96-3.08 3 3 0 0 0 .34-5.58 2.5 2.5 0 0 0-1.32-4.24 2.5 2.5 0 0 0-1.98-3A2.5 2.5 0 0 0 14.5 2Z" />
                    </IconBase>
                );
            },
            // Horizontal shaft with a right-pointing head.
            ArrowRight: (svgProps) => {
                return (
                    <IconBase {...svgProps}>
                        <line x1="5" y1="12" x2="19" y2="12" />
                        <polyline points="12 5 19 12 12 19" />
                    </IconBase>
                );
            },
            // Eye outline with a circular pupil.
            Eye: (svgProps) => {
                return (
                    <IconBase {...svgProps}>
                        <path d="M2 12s3-7 10-7 10 7 10 7-3 7-10 7-10-7-10-7Z" />
                        <circle cx="12" cy="12" r="3" />
                    </IconBase>
                );
            },
            // Stacked-cylinder database glyph.
            Database: (svgProps) => {
                return (
                    <IconBase {...svgProps}>
                        <ellipse cx="12" cy="5" rx="9" ry="3" />
                        <path d="M21 12c0 1.66-4 3-9 3s-9-1.34-9-3" />
                        <path d="M3 5v14c0 1.66 4 3 9 3s9-1.34 9-3V5" />
                    </IconBase>
                );
            },
            // Lightning bolt.
            Zap: (svgProps) => {
                return (
                    <IconBase {...svgProps}>
                        <polygon points="13 2 3 14 12 14 11 22 21 10 12 10 13 2" />
                    </IconBase>
                );
            },
            // Three stacked sheets.
            Layers: (svgProps) => {
                return (
                    <IconBase {...svgProps}>
                        <polygon points="12 2 2 7 12 12 22 7 12 2" />
                        <polyline points="2 17 12 22 22 17" />
                        <polyline points="2 12 12 17 22 12" />
                    </IconBase>
                );
            },
            // Warning triangle with an exclamation mark.
            AlertTriangle: (svgProps) => {
                return (
                    <IconBase {...svgProps}>
                        <path d="m21.73 18-8-14a2 2 0 0 0-3.48 0l-8 14A2 2 0 0 0 4 21h16a2 2 0 0 0 1.73-3Z" />
                        <line x1="12" y1="9" x2="12" y2="13" />
                        <line x1="12" y1="17" x2="12.01" y2="17" />
                    </IconBase>
                );
            },
            // Document with a folded corner and text lines.
            FileText: (svgProps) => {
                return (
                    <IconBase {...svgProps}>
                        <path d="M14.5 2H6a2 2 0 0 0-2 2v16a2 2 0 0 0 2 2h12a2 2 0 0 0 2-2V7.5L14.5 2z" />
                        <polyline points="14 2 14 8 20 8" />
                        <line x1="16" y1="13" x2="8" y2="13" />
                        <line x1="16" y1="17" x2="8" y2="17" />
                        <line x1="10" y1="9" x2="8" y2="9" />
                    </IconBase>
                );
            },
            // GitHub mark.
            Github: (svgProps) => {
                return (
                    <IconBase {...svgProps}>
                        <path d="M9 19c-5 1.5-5-2.5-7-3m14 6v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22" />
                    </IconBase>
                );
            },
        };

        // Pull each icon into scope so the JSX below can use <Brain />, <Eye />, ...
        const {
            AlertTriangle,
            ArrowRight,
            Brain,
            Database,
            Eye,
            FileText,
            Github,
            Layers,
            Zap,
        } = Icons;

        // --- Components ---

        /**
         * Fixed top bar: brand mark on the left, in-page section links on the
         * right (the links are hidden below the `md` breakpoint).
         */
        const Nav = () => {
            // Anchor targets match the `id` attributes of the page sections.
            const sectionLinks = [
                { href: "#abstract", label: "Abstract" },
                { href: "#architecture", label: "Architecture" },
                { href: "#challenges", label: "Challenges" },
            ];

            return (
                <nav className="fixed top-0 w-full z-50 glass-panel border-b border-slate-700/50">
                    <div className="max-w-6xl mx-auto px-6 h-16 flex items-center justify-between">
                        <div className="font-bold text-xl tracking-tight flex items-center gap-2">
                            <Brain className="text-cyan-400" size={24} />
                            <span>Dreamer<span className="text-cyan-400">-JEPA</span></span>
                        </div>
                        <div className="hidden md:flex space-x-8 text-sm font-medium text-slate-400">
                            {sectionLinks.map(({ href, label }) => (
                                <a key={href} href={href} className="hover:text-white transition">{label}</a>
                            ))}
                        </div>
                    </div>
                </nav>
            );
        };

        /**
         * Landing section: proposal badge, headline, one-sentence pitch and the
         * two call-to-action controls.
         *
         * "Read Proposal" is an in-page link to the #abstract section so the
         * control actually navigates; it keeps the same classes as before, so
         * its appearance is unchanged.
         */
        const Hero = () => (
            <section className="pt-32 pb-20 px-6 max-w-5xl mx-auto text-center">
                <div className="inline-block mb-4 px-3 py-1 rounded-full bg-cyan-900/30 border border-cyan-500/30 text-cyan-400 text-xs font-mono uppercase tracking-widest">
                    Research Proposal
                </div>
                <h1 className="text-5xl md:text-7xl font-extrabold mb-6 tracking-tight leading-tight">
                    Dreaming in <br className="hidden md:block" />
                    <span className="gradient-text">Concept Space</span>
                </h1>
                <p className="text-xl md:text-2xl text-slate-400 max-w-3xl mx-auto leading-relaxed mb-10">
                    Accelerating World Models by replacing pixel reconstruction with
                    Pretrained Joint-Embedding Predictive Architectures (JEPA).
                </p>
                <div className="flex flex-wrap justify-center gap-4">
                    <a href="#abstract" className="flex items-center gap-2 bg-slate-100 text-slate-900 px-6 py-3 rounded-lg font-semibold hover:bg-white transition">
                        <FileText size={18} /> Read Proposal
                    </a>
                    {/* TODO: no repository URL is defined in this file; turn this into a link once one exists. */}
                    <button type="button" className="flex items-center gap-2 bg-slate-800 text-white px-6 py-3 rounded-lg font-semibold hover:bg-slate-700 transition border border-slate-700">
                        <Github size={18} /> View Code
                    </button>
                </div>
            </section>
        );

        /**
         * Abstract section (anchor target `#abstract`).
         *
         * `scroll-mt-16` offsets anchor jumps by the height of the fixed nav
         * bar (h-16). Without it the section's 48px top padding (py-12) is
         * shorter than the bar, so the heading lands underneath it.
         */
        const Abstract = () => (
            <section id="abstract" className="scroll-mt-16 py-12 px-6 bg-slate-800/50 border-y border-slate-700/50">
                <div className="max-w-4xl mx-auto">
                    <h2 className="text-sm font-bold text-slate-500 uppercase tracking-widest mb-4">Abstract</h2>
                    <p className="text-lg text-slate-300 leading-relaxed text-justify">
                        We propose replacing the standard trainable CNN encoder in the Dreamer architecture with a massive, pretrained Joint-Embedding Predictive Architecture (JEPA), such as Meta's V-JEPA. Instead of learning to see from scratch, the agent utilizes the frozen JEPA backbone to instantly process raw visual observations into rich, semantic latent embeddings. The Dreamer’s Recurrent State Space Model (RSSM) consumes these embeddings to learn dynamics, effectively "dreaming" in high-level concept space. This eliminates the computationally expensive image decoder and pixel-reconstruction loss, focusing purely on predictive modeling.
                    </p>
                </div>
            </section>
        );

        /**
         * Motivation section: the reference Dreamer figure with its caption,
         * followed by a problem / solution pair of callout cards.
         */
        const Motivation = () => {
            // One entry per callout card; class strings are kept whole so each
            // utility class appears literally in the source.
            const callouts = [
                {
                    id: "problem",
                    boxClass: "bg-rose-900/20 border border-rose-500/30 rounded-xl p-6",
                    headingClass: "text-lg font-bold text-rose-400 mb-3 flex items-center gap-2",
                    emoji: "⚠️",
                    heading: "The Problem",
                    body: (
                        <>
                            Dreamer-V3 must first train its <strong>Encoder</strong> and <strong>Decoder</strong> networks
                            to accurately reconstruct pixel-level observations. This reconstruction objective delays the
                            actual <strong>Actor-Critic training</strong>, requiring millions of environment steps before
                            the world model produces useful latent representations. The decoder alone adds substantial
                            computational overhead while modeling irrelevant visual details.
                        </>
                    ),
                },
                {
                    id: "solution",
                    boxClass: "bg-cyan-900/20 border border-cyan-500/30 rounded-xl p-6",
                    headingClass: "text-lg font-bold text-cyan-400 mb-3 flex items-center gap-2",
                    emoji: "✨",
                    heading: "The V-JEPA Solution",
                    body: (
                        <>
                            By replacing the trainable encoder with a <strong>frozen V-JEPA backbone</strong>, we eliminate
                            the need for pixel reconstruction entirely. This dramatically reduces trainable parameters
                            (no encoder training, no decoder needed), saving compute while potentially <strong>increasing
                            generalization</strong> due to V-JEPA's pretraining on millions of diverse videos. The agent
                            can immediately leverage "adult-level" visual understanding.
                        </>
                    ),
                },
            ];

            return (
                <section className="py-20 px-6 max-w-6xl mx-auto">
                    <div className="max-w-5xl mx-auto">
                        <h2 className="text-3xl font-bold mb-8 text-center">The Bottleneck in Standard Dreamer</h2>

                        {/* Figure and attribution */}
                        <div className="glass-panel rounded-2xl p-8 md:p-12 mb-8">
                            <img
                                src="dreamer-basic-arch.png"
                                alt="Dreamer Architecture Comparison"
                                className="w-full rounded-lg mb-6"
                            />
                            <div className="text-center text-xs text-slate-500 mb-8">
                                Figure: (a) World Model Learning phase requires training encoder/decoder for reconstruction,
                                (b) Actor-Critic Learning can only begin after world model converges<br />
                                Source: <a href="https://github.com/danijar/dreamerv3" target="_blank" rel="noopener noreferrer" className="text-cyan-400 hover:text-cyan-300 underline">Dreamer-V3 GitHub</a>
                            </div>
                        </div>

                        {/* Problem / solution callouts */}
                        <div className="grid md:grid-cols-2 gap-6">
                            {callouts.map(({ id, boxClass, headingClass, emoji, heading, body }) => (
                                <div key={id} className={boxClass}>
                                    <h3 className={headingClass}>
                                        <span className="text-2xl">{emoji}</span> {heading}
                                    </h3>
                                    <p className="text-slate-300 text-sm leading-relaxed">
                                        {body}
                                    </p>
                                </div>
                            ))}
                        </div>
                    </div>
                </section>
            );
        };

        /**
         * Interactive side-by-side of the two pipelines (anchor target
         * `#architecture`).
         *
         * A two-button toggle switches `mode` between 'standard' and 'jepa'
         * (initially 'jepa'). The mode controls the encoder card's colours,
         * labels and badges, and which objective card is shown under the
         * pipeline. The input and RSSM cards are the same in both modes.
         *
         * The toggle buttons report their state through `aria-pressed`, and
         * carry an explicit `type="button"`.
         */
        const ArchitectureViewer = () => {
            const [mode, setMode] = useState('jepa'); // 'standard' or 'jepa'
            const isStandard = mode === 'standard';
            const isJepa = mode === 'jepa';

            // Both toggle buttons share a base style; only the active background differs.
            const toggleClass = (isActive, activeBg) =>
                `px-4 py-2 rounded-md text-sm font-medium transition ${
                    isActive ? `${activeBg} text-white shadow-lg` : 'text-slate-400 hover:text-white'
                }`;

            return (
                <section id="architecture" className="py-24 px-6 max-w-6xl mx-auto">
                    <div className="flex flex-col md:flex-row justify-between items-end mb-12">
                        <div>
                            <h2 className="text-3xl font-bold mb-4">Architecture Comparison</h2>
                            <p className="text-slate-400 max-w-lg">
                                Toggle to see how JEPA integration streamlines the World Model pipeline compared to the standard approach.
                            </p>
                        </div>
                        <div
                            role="group"
                            aria-label="Architecture variant"
                            className="bg-slate-800 p-1 rounded-lg inline-flex mt-6 md:mt-0"
                        >
                            <button
                                type="button"
                                aria-pressed={isStandard}
                                onClick={() => setMode('standard')}
                                className={toggleClass(isStandard, 'bg-slate-600')}
                            >
                                Standard Dreamer
                            </button>
                            <button
                                type="button"
                                aria-pressed={isJepa}
                                onClick={() => setMode('jepa')}
                                className={toggleClass(isJepa, 'bg-cyan-600')}
                            >
                                Proposed (JEPA)
                            </button>
                        </div>
                    </div>

                    {/* Diagram Container */}
                    <div className="glass-panel rounded-2xl p-8 md:p-12 relative overflow-hidden min-h-[400px] flex flex-col justify-center">

                        {/* Pipeline Visualization */}
                        <div className="relative z-10 flex flex-col md:flex-row gap-4 items-center justify-center">

                            {/* Input */}
                            <div className="flex flex-col items-center">
                                <div className="w-24 h-24 bg-slate-900 border-2 border-slate-700 rounded-xl flex items-center justify-center mb-4 relative">
                                    <Eye className="text-slate-500" size={32} />
                                    <div className="absolute -bottom-2 bg-slate-800 text-[10px] px-2 py-0.5 rounded text-slate-400">Obs (x)</div>
                                </div>
                                <span className="text-xs font-mono text-slate-500 uppercase">Input</span>
                            </div>

                            <ArrowRight className="hidden md:block text-slate-600 mx-auto" />

                            {/* Encoder Stage: rose styling for the trainable CNN, cyan for the frozen backbone */}
                            <div className={`p-6 rounded-xl border-2 transition-all duration-500 flex flex-col items-center text-center relative ${
                                isStandard
                                ? 'border-rose-500/30 bg-rose-500/10'
                                : 'border-cyan-500/30 bg-cyan-500/10'
                            }`}>
                                {isJepa && (
                                    <div className="absolute -top-3 right-4 bg-cyan-600 text-white text-[10px] font-bold px-2 py-1 rounded-full">
                                        FROZEN
                                    </div>
                                )}
                                <Layers className={`mb-3 ${isStandard ? 'text-rose-400' : 'text-cyan-400'}`} size={32} />
                                <h3 className="font-bold mb-1">
                                    {isStandard ? 'Trainable CNN' : 'V-JEPA Backbone'}
                                </h3>
                                <p className="text-xs opacity-70 mb-2">
                                    {isStandard ? 'Learns from scratch' : 'Pretrained on millions of videos'}
                                </p>
                                {isJepa && (
                                    <div className="mt-2 bg-cyan-900/50 px-2 py-1 rounded text-[10px] border border-cyan-500/30 text-cyan-300 w-full">
                                        + Lightweight Adapters
                                    </div>
                                )}
                            </div>

                            <ArrowRight className="hidden md:block text-slate-600 mx-auto" />

                            {/* World Model Stage */}
                            <div className="p-6 rounded-xl border-2 border-indigo-500/30 bg-indigo-500/10 flex flex-col items-center text-center">
                                <Brain className="text-indigo-400 mb-3" size={32} />
                                <h3 className="font-bold mb-1">RSSM</h3>
                                <p className="text-xs opacity-70">
                                    Recurrent State Space Model
                                </p>
                                <div className="mt-4 w-full h-1 bg-indigo-900/50 rounded-full overflow-hidden">
                                    <div className="h-full bg-indigo-500 animate-pulse w-2/3"></div>
                                </div>
                                <span className="text-[10px] mt-1 text-indigo-300">Latent Dynamics</span>
                            </div>

                        </div>

                        {/* Reconstruction / Objective Branch (Bottom) */}
                        {/* NOTE(review): `animate-in` / `fade-in` below are not defined in this file's own styles; presumably they come from a Tailwind animation plugin. Confirm it is loaded, otherwise they have no effect. */}
                        <div className="mt-12 pt-8 border-t border-dashed border-slate-700 relative">
                             <div className="absolute top-0 left-1/2 -translate-x-1/2 -translate-y-1/2 bg-slate-900 px-4 text-xs text-slate-500 font-mono">
                                OBJECTIVE FUNCTION
                            </div>

                            {isStandard ? (
                                <div className="flex flex-col items-center animate-in fade-in duration-500">
                                    <div className="h-12 w-0.5 bg-rose-500/50 mb-2"></div>
                                    <div className="border border-rose-500/50 bg-rose-900/10 p-4 rounded-lg text-center max-w-xs">
                                        <h4 className="text-rose-400 font-bold text-sm">Image Decoder</h4>
                                        <p className="text-xs text-slate-400 mt-1">Reconstructs pixels (Expensive)</p>
                                        <div className="mt-2 font-mono text-[10px] text-rose-300">Loss: ||x - x̂||²</div>
                                    </div>
                                </div>
                            ) : (
                                <div className="flex flex-col items-center animate-in fade-in duration-500">
                                    <div className="h-12 w-0.5 bg-cyan-500/50 mb-2"></div>
                                    <div className="border border-cyan-500/50 bg-cyan-900/10 p-4 rounded-lg text-center max-w-xs">
                                        <h4 className="text-cyan-400 font-bold text-sm">Latent Prediction</h4>
                                        <p className="text-xs text-slate-400 mt-1">Predicts future features (Efficient)</p>
                                        <div className="mt-2 font-mono text-[10px] text-cyan-300">Loss: Latent Space Only</div>
                                    </div>
                                </div>
                            )}
                        </div>
                    </div>
                </section>
            );
        };

        /**
         * Three-column grid of benefit cards. Each card has a tinted icon
         * badge, a title and a short paragraph; only the accent colour, icon
         * and copy differ between cards.
         */
        const Features = () => {
            const cards = [
                {
                    Icon: Zap,
                    hoverClass: "hover:border-cyan-500/50",
                    badgeClass: "bg-cyan-900/50",
                    iconClass: "text-cyan-400",
                    title: "Immediate Robustness",
                    text: "Leveraging foundation models trained on vast datasets, the agent handles visual noise (leaves, static) and complex geometry instantly, bypassing the millions of steps usually needed for visual convergence.",
                },
                {
                    Icon: Database,
                    hoverClass: "hover:border-purple-500/50",
                    badgeClass: "bg-purple-900/50",
                    iconClass: "text-purple-400",
                    title: "Mathematical Synergy",
                    text: "V-JEPA's pretraining objective—predicting latent representations of future frames—aligns perfectly with Dreamer's goal of predicting future world states, creating a unified predictive pipeline.",
                },
                {
                    Icon: Layers,
                    hoverClass: "hover:border-green-500/50",
                    badgeClass: "bg-green-900/50",
                    iconClass: "text-green-400",
                    title: "No Decoder",
                    text: 'By removing the need to reconstruct pixels, we eliminate the modeling of irrelevant details. The model "dreams" strictly in concepts, significantly reducing computational overhead.',
                },
            ];

            return (
                <section className="py-20 bg-slate-900">
                    <div className="max-w-6xl mx-auto px-6 grid grid-cols-1 md:grid-cols-3 gap-8">
                        {cards.map(({ Icon, hoverClass, badgeClass, iconClass, title, text }) => (
                            <div
                                key={title}
                                className={`p-6 bg-slate-800 rounded-xl border border-slate-700 ${hoverClass} transition duration-300`}
                            >
                                <div className={`w-12 h-12 ${badgeClass} rounded-lg flex items-center justify-center mb-4`}>
                                    <Icon className={iconClass} />
                                </div>
                                <h3 className="text-xl font-bold mb-2">{title}</h3>
                                <p className="text-slate-400 leading-relaxed">
                                    {text}
                                </p>
                            </div>
                        ))}
                    </div>
                </section>
            );
        };

        /**
         * Layout shared by both challenge cards: icon badge on the left (shown
         * from the `md` breakpoint up), then title, summary paragraph and a
         * bordered "proposed solution" box whose body is passed as children.
         */
        const ChallengeCard = ({
            shellClass,
            badgeClass,
            badge,
            titleClass,
            title,
            summary,
            solutionClass,
            solutionTitle,
            children,
        }) => (
            <div className={shellClass}>
                <div className="flex items-start gap-6">
                    <div className={badgeClass}>
                        {badge}
                    </div>
                    <div>
                        <h3 className={titleClass}>{title}</h3>
                        <p className="text-slate-300 mb-6 leading-relaxed">
                            {summary}
                        </p>

                        <div className={solutionClass}>
                            <h4 className="text-lg font-bold text-white mb-2">{solutionTitle}</h4>
                            {children}
                        </div>
                    </div>
                </div>
            </div>
        );

        /**
         * "Critical Challenges & Risks" section (anchor target `#challenges`):
         * two risk cards, each paired with its proposed mitigation.
         */
        const Challenges = () => {
            // Bullet points for the second card's mitigation list.
            const validationMethods = [
                {
                    label: "Latent prediction accuracy:",
                    detail: "Measure how well future latent states are predicted in V-JEPA space",
                },
                {
                    label: "Downstream task performance:",
                    detail: "Monitor RL reward signals and convergence speed as indirect validation",
                },
                {
                    label: "Probing classifiers:",
                    detail: "Train lightweight probes to predict known world properties (object positions, states) from latents",
                },
                {
                    label: "Optional sparse decoding:",
                    detail: "Periodically reconstruct a small batch of frames for qualitative inspection",
                },
            ];

            return (
                <section id="challenges" className="py-24 px-6 max-w-5xl mx-auto">
                    <h2 className="text-3xl font-bold mb-8 text-center">Critical Challenges & Risks</h2>

                    <div className="space-y-6">
                        {/* Challenge 1: Red Light Problem */}
                        <ChallengeCard
                            shellClass="bg-gradient-to-br from-amber-900/20 to-slate-800/50 border border-amber-500/20 rounded-2xl p-8 md:p-12"
                            badgeClass="bg-amber-500/10 p-3 rounded-full hidden md:block"
                            badge={<AlertTriangle className="text-amber-500" size={32} />}
                            titleClass="text-2xl font-bold mb-4 text-amber-500"
                            title={'Challenge 1: The "Red Light" Problem'}
                            summary={
                                <>
                                    A significant risk of using a completely frozen encoder is the potential filtering of tiny, task-relevant details. A small red light might be statistically insignificant in general internet video data (V-JEPA's training set) but critical for a specific RL task (e.g., a braking signal).
                                </>
                            }
                            solutionClass="bg-slate-900/80 p-6 rounded-xl border-l-4 border-cyan-500"
                            solutionTitle="Proposed Solution: Trainable Adapters"
                        >
                            <p className="text-slate-400 text-sm leading-relaxed">
                                To mitigate this, we insert lightweight <strong>Trainable Adapters</strong> (Low-Rank Adaptation or similar) into the JEPA backbone. This allows the RL signal to tune attention toward task-specific features without destroying the pretrained general knowledge, maintaining the "adult-level" visual processing while allowing for task specialization.
                            </p>
                        </ChallengeCard>

                        {/* Challenge 2: Validation Problem */}
                        <ChallengeCard
                            shellClass="bg-gradient-to-br from-red-900/20 to-slate-800/50 border border-red-500/20 rounded-2xl p-8 md:p-12"
                            badgeClass="bg-red-500/10 p-3 rounded-full hidden md:block"
                            badge={<Eye className="text-red-500" size={32} />}
                            titleClass="text-2xl font-bold mb-4 text-red-400"
                            title="Challenge 2: The Validation Problem"
                            summary={
                                <>
                                    Without a decoder to reconstruct pixel representations, it becomes significantly harder to validate that the hidden state <em>actually</em> represents the world state accurately. In standard Dreamer, poor reconstruction quality serves as a clear diagnostic signal that something is wrong with the latent representations. Removing this feedback loop makes debugging and verification more challenging.
                                </>
                            }
                            solutionClass="bg-slate-900/80 p-6 rounded-xl border-l-4 border-purple-500"
                            solutionTitle="Proposed Solution: Alternative Validation Methods"
                        >
                            <p className="text-slate-400 text-sm leading-relaxed mb-3">
                                We propose using <strong>proxy validation metrics</strong> to ensure representation quality:
                            </p>
                            <ul className="text-slate-400 text-sm space-y-2 list-disc list-inside">
                                {validationMethods.map(({ label, detail }) => (
                                    <li key={label}><strong>{label}</strong> {detail}</li>
                                ))}
                            </ul>
                        </ChallengeCard>
                    </div>
                </section>
            );
        };

        /**
         * Page footer: copyright line, one-sentence summary and a row of
         * placeholder links (every link currently points at "#").
         */
        const Footer = () => {
            const linkLabels = ["Paper (Coming Soon)", "HuggingFace", "Contact"];

            return (
                <footer className="py-12 border-t border-slate-800 text-center text-slate-500 text-sm">
                    <div className="mb-4">
                        <span className="font-bold text-slate-300">Dreamer-JEPA Proposal</span> &copy; 2025
                    </div>
                    <p className="max-w-md mx-auto mb-6">
                        A hybrid architecture shifting Dreamer from a generative model to a purely predictive model.
                    </p>
                    <div className="flex justify-center gap-6">
                        {linkLabels.map((label) => (
                            <a key={label} href="#" className="hover:text-cyan-400 transition">{label}</a>
                        ))}
                    </div>
                </footer>
            );
        };

        /**
         * Root component: stacks every page section, top to bottom, inside a
         * full-height wrapper.
         */
        const App = () => {
            // Render order of the page.
            const pageSections = [
                Nav,
                Hero,
                Abstract,
                Motivation,
                ArchitectureViewer,
                Features,
                Challenges,
                Footer,
            ];

            return (
                <div className="min-h-screen">
                    {pageSections.map((Section, position) => (
                        // The list is static, so the position is a stable key.
                        <Section key={position} />
                    ))}
                </div>
            );
        };

        // Mount the app into the #root placeholder via the React 18 root API.
        const mountPoint = document.getElementById('root');
        const root = ReactDOM.createRoot(mountPoint);
        root.render(<App />);
    </script>
</body>
</html>