Files
agi-lecture-dreamer-gepa/proposal/index.html
2025-11-25 10:50:52 +01:00

404 lines
23 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Dreamer-JEPA: Foundation World Models</title>
<!-- React & ReactDOM -->
<script crossorigin src="https://unpkg.com/react@18/umd/react.development.js"></script>
<script crossorigin src="https://unpkg.com/react-dom@18/umd/react-dom.development.js"></script>
<!-- Babel -->
<script src="https://unpkg.com/@babel/standalone/babel.min.js"></script>
<!-- Tailwind CSS -->
<script src="https://cdn.tailwindcss.com"></script>
<!-- Google Fonts -->
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;800&family=JetBrains+Mono:wght@400;700&display=swap" rel="stylesheet">
<style>
/* Page-wide defaults: Inter typeface, light slate text on a dark slate background. */
body {
  font-family: 'Inter', sans-serif;
  background-color: #0f172a; /* Slate 900 */
  color: #e2e8f0; /* Slate 200 */
}

/* NOTE(review): Tailwind ships its own `.font-mono` utility; confirm this rule
   still wins the cascade once the CDN build injects its stylesheet. */
.font-mono {
  font-family: 'JetBrains Mono', monospace;
}

/* Gradient-filled text: the background is clipped to the glyph shapes and the
   text fill is made transparent so the gradient shows through. */
.gradient-text {
  background: linear-gradient(to right, #38bdf8, #818cf8);
  -webkit-background-clip: text;
  background-clip: text; /* standard property, declared alongside the prefixed one */
  -webkit-text-fill-color: transparent;
}

/* Translucent panel with a blurred backdrop and a faint light border. */
.glass-panel {
  background: rgba(30, 41, 59, 0.7);
  -webkit-backdrop-filter: blur(10px); /* prefixed form for older Safari releases */
  backdrop-filter: blur(10px);
  border: 1px solid rgba(255, 255, 255, 0.1);
}

/* Custom diagram animations */
/* A dasharray of 10 repeats every 20 units (10 dash + 10 gap), so moving the
   offset by -20 per cycle loops without a visible jump. */
.flow-line {
  stroke-dasharray: 10;
  animation: dash 1s linear infinite;
}

@keyframes dash {
  to {
    stroke-dashoffset: -20;
  }
}
</style>
</head>
<body>
<div id="root"></div>
<script type="text/babel">
const { useState, useEffect } = React;
// --- Icons (Inline SVGs to avoid CDN issues) ---
/**
 * Shared wrapper for the inline stroke icons: a 24x24-viewBox <svg> drawn with
 * `currentColor`, so the icon takes its colour from the surrounding text colour.
 *
 * The SVG is hidden from assistive technology by default (`aria-hidden`) and
 * kept out of the tab order in legacy browsers (`focusable="false"`), because
 * the icons on this page accompany visible text. Callers that need a
 * meaningful icon can override this through the forwarded props.
 *
 * @param {object} props
 * @param {React.ReactNode} props.children - SVG shape elements to draw.
 * @param {number} [props.size=24] - Rendered width and height.
 * @param {string} [props.className=""] - Extra classes for the <svg> element.
 * @param {object} [props.svgProps] - Any remaining props are spread onto <svg>.
 */
const IconBase = ({ children, size = 24, className = "", ...svgProps }) => (
  <svg
    xmlns="http://www.w3.org/2000/svg"
    width={size}
    height={size}
    viewBox="0 0 24 24"
    fill="none"
    stroke="currentColor"
    strokeWidth="2"
    strokeLinecap="round"
    strokeLinejoin="round"
    className={className}
    aria-hidden="true"
    focusable="false"
    {...svgProps}
  >
    {children}
  </svg>
);
// Each icon is a thin component that forwards its props to IconBase and
// supplies only the shape elements for that glyph.

const Brain = (iconProps) => {
  return (
    <IconBase {...iconProps}>
      <path d="M9.5 2A2.5 2.5 0 0 1 12 4.5v15a2.5 2.5 0 0 1-4.96.44 2.5 2.5 0 0 1-2.96-3.08 3 3 0 0 1-.34-5.58 2.5 2.5 0 0 1 1.32-4.24 2.5 2.5 0 0 1 1.98-3A2.5 2.5 0 0 1 9.5 2Z" />
      <path d="M14.5 2A2.5 2.5 0 0 0 12 4.5v15a2.5 2.5 0 0 0 4.96.44 2.5 2.5 0 0 0 2.96-3.08 3 3 0 0 0 .34-5.58 2.5 2.5 0 0 0-1.32-4.24 2.5 2.5 0 0 0-1.98-3A2.5 2.5 0 0 0 14.5 2Z" />
    </IconBase>
  );
};

const ArrowRight = (iconProps) => {
  return (
    <IconBase {...iconProps}>
      <line x1="5" y1="12" x2="19" y2="12" />
      <polyline points="12 5 19 12 12 19" />
    </IconBase>
  );
};

const Eye = (iconProps) => {
  return (
    <IconBase {...iconProps}>
      <path d="M2 12s3-7 10-7 10 7 10 7-3 7-10 7-10-7-10-7Z" />
      <circle cx="12" cy="12" r="3" />
    </IconBase>
  );
};

const Database = (iconProps) => {
  return (
    <IconBase {...iconProps}>
      <ellipse cx="12" cy="5" rx="9" ry="3" />
      <path d="M21 12c0 1.66-4 3-9 3s-9-1.34-9-3" />
      <path d="M3 5v14c0 1.66 4 3 9 3s9-1.34 9-3V5" />
    </IconBase>
  );
};

const Zap = (iconProps) => {
  return (
    <IconBase {...iconProps}>
      <polygon points="13 2 3 14 12 14 11 22 21 10 12 10 13 2" />
    </IconBase>
  );
};

const Layers = (iconProps) => {
  return (
    <IconBase {...iconProps}>
      <polygon points="12 2 2 7 12 12 22 7 12 2" />
      <polyline points="2 17 12 22 22 17" />
      <polyline points="2 12 12 17 22 12" />
    </IconBase>
  );
};

const AlertTriangle = (iconProps) => {
  return (
    <IconBase {...iconProps}>
      <path d="m21.73 18-8-14a2 2 0 0 0-3.48 0l-8 14A2 2 0 0 0 4 21h16a2 2 0 0 0 1.73-3Z" />
      <line x1="12" y1="9" x2="12" y2="13" />
      <line x1="12" y1="17" x2="12.01" y2="17" />
    </IconBase>
  );
};

const FileText = (iconProps) => {
  return (
    <IconBase {...iconProps}>
      <path d="M14.5 2H6a2 2 0 0 0-2 2v16a2 2 0 0 0 2 2h12a2 2 0 0 0 2-2V7.5L14.5 2z" />
      <polyline points="14 2 14 8 20 8" />
      <line x1="16" y1="13" x2="8" y2="13" />
      <line x1="16" y1="17" x2="8" y2="17" />
      <line x1="10" y1="9" x2="8" y2="9" />
    </IconBase>
  );
};

const Github = (iconProps) => {
  return (
    <IconBase {...iconProps}>
      <path d="M9 19c-5 1.5-5-2.5-7-3m14 6v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22" />
    </IconBase>
  );
};

// Lookup object exposing every icon under its component name.
const Icons = {
  Brain,
  ArrowRight,
  Eye,
  Database,
  Zap,
  Layers,
  AlertTriangle,
  FileText,
  Github,
};
// --- Components ---
// Fixed top bar: brand mark on the left, in-page section links on the right.
// The link row carries `hidden md:flex`, so it only appears from the md breakpoint up.
const Nav = () => {
  const sectionLinks = [
    { href: "#abstract", label: "Abstract" },
    { href: "#architecture", label: "Architecture" },
    { href: "#challenges", label: "Challenges" },
  ];

  return (
    <nav className="fixed top-0 w-full z-50 glass-panel border-b border-slate-700/50">
      <div className="max-w-6xl mx-auto px-6 h-16 flex items-center justify-between">
        <div className="font-bold text-xl tracking-tight flex items-center gap-2">
          <Brain className="text-cyan-400" size={24} />
          <span>Dreamer<span className="text-cyan-400">-JEPA</span></span>
        </div>
        <div className="hidden md:flex space-x-8 text-sm font-medium text-slate-400">
          {sectionLinks.map(({ href, label }) => (
            <a key={href} href={href} className="hover:text-white transition">{label}</a>
          ))}
        </div>
      </div>
    </nav>
  );
};
// Landing banner: badge, headline, one-sentence pitch and two calls to action.
//
// "Read Proposal" is an in-page link to the #abstract section so the primary
// call to action actually navigates somewhere.
// TODO(review): "View Code" has no destination because no repository URL is
// available in this file; turn it into a link once the URL is known.
const Hero = () => (
  <section className="pt-32 pb-20 px-6 max-w-5xl mx-auto text-center">
    <div className="inline-block mb-4 px-3 py-1 rounded-full bg-cyan-900/30 border border-cyan-500/30 text-cyan-400 text-xs font-mono uppercase tracking-widest">
      Research Proposal
    </div>
    <h1 className="text-5xl md:text-7xl font-extrabold mb-6 tracking-tight leading-tight">
      Dreaming in <br className="hidden md:block" />
      <span className="gradient-text">Concept Space</span>
    </h1>
    <p className="text-xl md:text-2xl text-slate-400 max-w-3xl mx-auto leading-relaxed mb-10">
      Accelerating World Models by replacing pixel reconstruction with
      Pretrained Joint-Embedding Predictive Architectures (JEPA).
    </p>
    <div className="flex flex-wrap justify-center gap-4">
      <a
        href="#abstract"
        className="flex items-center gap-2 bg-slate-100 text-slate-900 px-6 py-3 rounded-lg font-semibold hover:bg-white transition"
      >
        <FileText size={18} /> Read Proposal
      </a>
      <button
        type="button"
        className="flex items-center gap-2 bg-slate-800 text-white px-6 py-3 rounded-lg font-semibold hover:bg-slate-700 transition border border-slate-700"
      >
        <Github size={18} /> View Code
      </button>
    </div>
  </section>
);
// Abstract section: a single justified paragraph summarising the proposal.
// It is the target of the "#abstract" navigation link.
const Abstract = () => (
  <section id="abstract" className="py-12 px-6 bg-slate-800/50 border-y border-slate-700/50">
    <div className="max-w-4xl mx-auto">
      <h2 className="text-sm font-bold text-slate-500 uppercase tracking-widest mb-4">Abstract</h2>
      <p className="text-lg text-slate-300 leading-relaxed text-justify">
        We propose replacing the standard trainable CNN encoder in the Dreamer architecture with a massive, pretrained Joint-Embedding Predictive Architecture (JEPA), such as Meta's V-JEPA. Instead of learning to see from scratch, the agent utilizes the frozen JEPA backbone to instantly process raw visual observations into rich, semantic latent embeddings. Dreamer's Recurrent State Space Model (RSSM) consumes these embeddings to learn dynamics, effectively "dreaming" in high-level concept space. This eliminates the computationally expensive image decoder and pixel-reconstruction loss, focusing purely on predictive modeling.
      </p>
    </div>
  </section>
);
// Interactive comparison of the two pipelines. A two-button toggle switches
// `mode` between 'standard' and 'jepa'; the encoder card and the objective
// card below the pipeline change their copy and colours to match.
//
// The toggle buttons expose their state through `aria-pressed` and are grouped
// under a label, so the selection is not conveyed by colour alone. They also
// declare `type="button"` so they never act as submit buttons inside a form.
const ArchitectureViewer = () => {
  const [mode, setMode] = useState('jepa'); // 'standard' or 'jepa'
  const isStandard = mode === 'standard';
  const isJepa = mode === 'jepa';

  return (
    <section id="architecture" className="py-24 px-6 max-w-6xl mx-auto">
      <div className="flex flex-col md:flex-row justify-between items-end mb-12">
        <div>
          <h2 className="text-3xl font-bold mb-4">Architecture Comparison</h2>
          <p className="text-slate-400 max-w-lg">
            Toggle to see how JEPA integration streamlines the World Model pipeline compared to the standard approach.
          </p>
        </div>
        <div
          role="group"
          aria-label="Architecture variant"
          className="bg-slate-800 p-1 rounded-lg inline-flex mt-6 md:mt-0"
        >
          <button
            type="button"
            aria-pressed={isStandard}
            onClick={() => setMode('standard')}
            className={`px-4 py-2 rounded-md text-sm font-medium transition ${isStandard ? 'bg-slate-600 text-white shadow-lg' : 'text-slate-400 hover:text-white'}`}
          >
            Standard Dreamer
          </button>
          <button
            type="button"
            aria-pressed={isJepa}
            onClick={() => setMode('jepa')}
            className={`px-4 py-2 rounded-md text-sm font-medium transition ${isJepa ? 'bg-cyan-600 text-white shadow-lg' : 'text-slate-400 hover:text-white'}`}
          >
            Proposed (JEPA)
          </button>
        </div>
      </div>
      {/* Diagram Container */}
      <div className="glass-panel rounded-2xl p-8 md:p-12 relative overflow-hidden min-h-[400px] flex flex-col justify-center">
        {/* Pipeline Visualization */}
        <div className="relative z-10 flex flex-col md:flex-row gap-4 items-center justify-center">
          {/* Input */}
          <div className="flex flex-col items-center">
            <div className="w-24 h-24 bg-slate-900 border-2 border-slate-700 rounded-xl flex items-center justify-center mb-4 relative">
              <Eye className="text-slate-500" size={32} />
              <div className="absolute -bottom-2 bg-slate-800 text-[10px] px-2 py-0.5 rounded text-slate-400">Obs (x)</div>
            </div>
            <span className="text-xs font-mono text-slate-500 uppercase">Input</span>
          </div>
          <ArrowRight className="hidden md:block text-slate-600 mx-auto" />
          {/* Encoder Stage */}
          <div className={`p-6 rounded-xl border-2 transition-all duration-500 flex flex-col items-center text-center relative ${
            isStandard
              ? 'border-rose-500/30 bg-rose-500/10'
              : 'border-cyan-500/30 bg-cyan-500/10'
          }`}>
            {isJepa && (
              <div className="absolute -top-3 right-4 bg-cyan-600 text-white text-[10px] font-bold px-2 py-1 rounded-full">
                FROZEN
              </div>
            )}
            <Layers className={`mb-3 ${isStandard ? 'text-rose-400' : 'text-cyan-400'}`} size={32} />
            <h3 className="font-bold mb-1">
              {isStandard ? 'Trainable CNN' : 'V-JEPA Backbone'}
            </h3>
            <p className="text-xs opacity-70 mb-2">
              {isStandard ? 'Learns from scratch' : 'Pretrained on millions of videos'}
            </p>
            {isJepa && (
              <div className="mt-2 bg-cyan-900/50 px-2 py-1 rounded text-[10px] border border-cyan-500/30 text-cyan-300 w-full">
                + Lightweight Adapters
              </div>
            )}
          </div>
          <ArrowRight className="hidden md:block text-slate-600 mx-auto" />
          {/* World Model Stage */}
          <div className="p-6 rounded-xl border-2 border-indigo-500/30 bg-indigo-500/10 flex flex-col items-center text-center">
            <Brain className="text-indigo-400 mb-3" size={32} />
            <h3 className="font-bold mb-1">RSSM</h3>
            <p className="text-xs opacity-70">
              Recurrent State Space Model
            </p>
            <div className="mt-4 w-full h-1 bg-indigo-900/50 rounded-full overflow-hidden">
              <div className="h-full bg-indigo-500 animate-pulse w-2/3"></div>
            </div>
            <span className="text-[10px] mt-1 text-indigo-300">Latent Dynamics</span>
          </div>
        </div>
        {/* Reconstruction / Objective Branch (Bottom) */}
        {/* NOTE(review): `animate-in fade-in` look like tailwindcss-animate utilities; confirm that plugin is loaded, otherwise these classes have no effect. */}
        <div className="mt-12 pt-8 border-t border-dashed border-slate-700 relative">
          <div className="absolute top-0 left-1/2 -translate-x-1/2 -translate-y-1/2 bg-slate-900 px-4 text-xs text-slate-500 font-mono">
            OBJECTIVE FUNCTION
          </div>
          {isStandard ? (
            <div className="flex flex-col items-center animate-in fade-in duration-500">
              <div className="h-12 w-0.5 bg-rose-500/50 mb-2"></div>
              <div className="border border-rose-500/50 bg-rose-900/10 p-4 rounded-lg text-center max-w-xs">
                <h4 className="text-rose-400 font-bold text-sm">Image Decoder</h4>
                <p className="text-xs text-slate-400 mt-1">Reconstructs pixels (Expensive)</p>
                <div className="mt-2 font-mono text-[10px] text-rose-300">Loss: ||x - x̂||²</div>
              </div>
            </div>
          ) : (
            <div className="flex flex-col items-center animate-in fade-in duration-500">
              <div className="h-12 w-0.5 bg-cyan-500/50 mb-2"></div>
              <div className="border border-cyan-500/50 bg-cyan-900/10 p-4 rounded-lg text-center max-w-xs">
                <h4 className="text-cyan-400 font-bold text-sm">Latent Prediction</h4>
                <p className="text-xs text-slate-400 mt-1">Predicts future features (Efficient)</p>
                <div className="mt-2 font-mono text-[10px] text-cyan-300">Loss: Latent Space Only</div>
              </div>
            </div>
          )}
        </div>
      </div>
    </section>
  );
};
// Three-column grid of benefit cards (single column below the md breakpoint).
// Every card shares one layout; only the accent colour, icon, title and body
// differ, so the cards are rendered from a data list. Class names are written
// out in full rather than assembled from colour fragments so they stay
// searchable.
const Features = () => {
  const cards = [
    {
      title: 'Immediate Robustness',
      Icon: Zap,
      hoverBorder: 'hover:border-cyan-500/50',
      iconBackground: 'bg-cyan-900/50',
      iconColor: 'text-cyan-400',
      body: 'Leveraging foundation models trained on vast datasets, the agent handles visual noise (leaves, static) and complex geometry instantly, bypassing the millions of steps usually needed for visual convergence.',
    },
    {
      title: 'Mathematical Synergy',
      Icon: Database,
      hoverBorder: 'hover:border-purple-500/50',
      iconBackground: 'bg-purple-900/50',
      iconColor: 'text-purple-400',
      body: "V-JEPA's pretraining objective—predicting latent representations of future frames—aligns perfectly with Dreamer's goal of predicting future world states, creating a unified predictive pipeline.",
    },
    {
      title: 'No Decoder',
      Icon: Layers,
      hoverBorder: 'hover:border-green-500/50',
      iconBackground: 'bg-green-900/50',
      iconColor: 'text-green-400',
      body: 'By removing the need to reconstruct pixels, we eliminate the modeling of irrelevant details. The model "dreams" strictly in concepts, significantly reducing computational overhead.',
    },
  ];

  return (
    <section className="py-20 bg-slate-900">
      <div className="max-w-6xl mx-auto px-6 grid grid-cols-1 md:grid-cols-3 gap-8">
        {cards.map(({ title, Icon, hoverBorder, iconBackground, iconColor, body }) => (
          <div
            key={title}
            className={`p-6 bg-slate-800 rounded-xl border border-slate-700 ${hoverBorder} transition duration-300`}
          >
            <div className={`w-12 h-12 ${iconBackground} rounded-lg flex items-center justify-center mb-4`}>
              <Icon className={iconColor} />
            </div>
            <h3 className="text-xl font-bold mb-2">{title}</h3>
            <p className="text-slate-400 leading-relaxed">{body}</p>
          </div>
        ))}
      </div>
    </section>
  );
};
// Risk section: an amber-tinted panel that states the problem, followed by a
// cyan-accented callout describing the proposed mitigation. The warning icon
// wrapper is `hidden md:block`, so it only shows from the md breakpoint up.
const Challenges = () => {
  // The mitigation callout, built separately to keep the panel markup shallow.
  const solutionCallout = (
    <div className="bg-slate-900/80 p-6 rounded-xl border-l-4 border-cyan-500">
      <h3 className="text-lg font-bold text-white mb-2">Proposed Solution: Trainable Adapters</h3>
      <p className="text-slate-400 text-sm leading-relaxed">
        To mitigate this, we insert lightweight <strong>Trainable Adapters</strong> (Low-Rank Adaptation or similar) into the JEPA backbone. This allows the RL signal to tune attention toward task-specific features without destroying the pretrained general knowledge, maintaining the "adult-level" visual processing while allowing for task specialization.
      </p>
    </div>
  );

  return (
    <section id="challenges" className="py-24 px-6 max-w-5xl mx-auto">
      <div className="bg-gradient-to-br from-amber-900/20 to-slate-800/50 border border-amber-500/20 rounded-2xl p-8 md:p-12">
        <div className="flex items-start gap-6">
          <div className="bg-amber-500/10 p-3 rounded-full hidden md:block">
            <AlertTriangle className="text-amber-500" size={32} />
          </div>
          <div>
            <h2 className="text-2xl font-bold mb-4 text-amber-500">Critical Challenge: The "Red Light" Problem</h2>
            <p className="text-slate-300 mb-6 leading-relaxed">
              A significant risk of using a completely frozen encoder is the potential filtering of tiny, task-relevant details. A small red light might be statistically insignificant in general internet video data (V-JEPA's training set) but critical for a specific RL task (e.g., a braking signal).
            </p>
            {solutionCallout}
          </div>
        </div>
      </div>
    </section>
  );
};
// Page footer: title with copyright line, a one-sentence summary and a row of
// links. Every link currently points at "#" (placeholder destinations).
const Footer = () => {
  const linkLabels = ["Paper (Coming Soon)", "HuggingFace", "Contact"];

  return (
    <footer className="py-12 border-t border-slate-800 text-center text-slate-500 text-sm">
      <div className="mb-4">
        <span className="font-bold text-slate-300">Dreamer-JEPA Proposal</span> &copy; 2025
      </div>
      <p className="max-w-md mx-auto mb-6">
        A hybrid architecture shifting Dreamer from a generative model to a purely predictive model.
      </p>
      <div className="flex justify-center gap-6">
        {linkLabels.map((label) => (
          <a key={label} href="#" className="hover:text-cyan-400 transition">{label}</a>
        ))}
      </div>
    </footer>
  );
};
// Root component: renders every page section, top to bottom, inside a
// full-height wrapper.
const App = () => {
  // Order here is the order on the page.
  const pageSections = [
    ['nav', Nav],
    ['hero', Hero],
    ['abstract', Abstract],
    ['architecture', ArchitectureViewer],
    ['features', Features],
    ['challenges', Challenges],
    ['footer', Footer],
  ];

  return (
    <div className="min-h-screen">
      {pageSections.map(([sectionKey, Section]) => (
        <Section key={sectionKey} />
      ))}
    </div>
  );
};
// Mount the application into the #root placeholder element.
const mountPoint = document.getElementById('root');
const root = ReactDOM.createRoot(mountPoint);
root.render(<App />);
</script>
</body>
</html>