diff --git a/.env b/.env
new file mode 100644
index 0000000..2889d71
--- /dev/null
+++ b/.env
@@ -0,0 +1,5 @@
+# Gemini API Key for local development/testing
+# Note: For the GitHub Pages site, users will be prompted to enter their own API key
+# which is stored in their browser's localStorage (not shared with the server)
+# Get your free API key at: https://aistudio.google.com/apikey
+GEMINI_API_KEY=<your-gemini-api-key>
\ No newline at end of file
diff --git a/docs/.nojekyll b/docs/.nojekyll
new file mode 100644
index 0000000..e69de29
diff --git a/docs/cloudflare-worker.js b/docs/cloudflare-worker.js
new file mode 100644
index 0000000..40267d4
--- /dev/null
+++ b/docs/cloudflare-worker.js
@@ -0,0 +1,120 @@
+/**
+ * TinyGPU Gemini API Proxy - Cloudflare Worker
+ *
+ * This worker proxies requests to the Gemini API, keeping your API key secure.
+ * Deploy this to Cloudflare Workers and set the GEMINI_API_KEY secret.
+ *
+ * Setup Instructions:
+ * 1. Go to https://dash.cloudflare.com/ and sign up/login
+ * 2. Go to Workers & Pages > Create Application > Create Worker
+ * 3. Name it something like "tinygpu-gemini-proxy"
+ * 4. Replace the default code with this file's contents
+ * 5. Go to Settings > Variables > Add Variable
+ *    - Name: GEMINI_API_KEY
+ *    - Value: Your Gemini API key
+ *    - Click "Encrypt" to keep it secret
+ * 6. Save and Deploy
+ * 7. Your worker URL will be: https://tinygpu-gemini-proxy.<your-subdomain>.workers.dev
+ */
+
+export default {
+  async fetch(request, env) {
+    // Handle CORS preflight
+    if (request.method === "OPTIONS") {
+      return new Response(null, {
+        headers: {
+          "Access-Control-Allow-Origin": "*",
+          "Access-Control-Allow-Methods": "POST, OPTIONS",
+          "Access-Control-Allow-Headers": "Content-Type",
+          "Access-Control-Max-Age": "86400",
+        },
+      });
+    }
+
+    // Only allow POST requests
+    if (request.method !== "POST") {
+      return new Response(JSON.stringify({ error: "Method not allowed" }), {
+        status: 405,
+        headers: {
+          "Content-Type": "application/json",
+          "Access-Control-Allow-Origin": "*",
+        },
+      });
+    }
+
+    try {
+      // Get the request body
+      const body = await request.json();
+
+      // Validate required fields
+      if (!body.prompt) {
+        return new Response(JSON.stringify({ error: "Missing prompt" }), {
+          status: 400,
+          headers: {
+            "Content-Type": "application/json",
+            "Access-Control-Allow-Origin": "*",
+          },
+        });
+      }
+
+      // Build Gemini API request
+      const geminiUrl = `https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key=${env.GEMINI_API_KEY}`;
+
+      const geminiPayload = {
+        contents: [{ parts: [{ text: body.prompt }] }],
+      };
+
+      // Add system instruction if provided
+      if (body.systemPrompt) {
+        geminiPayload.systemInstruction = {
+          parts: [{ text: body.systemPrompt }],
+        };
+      }
+
+      // Call Gemini API
+      const geminiResponse = await fetch(geminiUrl, {
+        method: "POST",
+        headers: { "Content-Type": "application/json" },
+        body: JSON.stringify(geminiPayload),
+      });
+
+      if (!geminiResponse.ok) {
+        const errorText = await geminiResponse.text();
+        return new Response(
+          JSON.stringify({
+            error: "Gemini API error",
+            status: geminiResponse.status,
+            details: errorText,
+          }),
+          {
+            status: geminiResponse.status,
+            headers: {
+              "Content-Type": "application/json",
+              "Access-Control-Allow-Origin": "*",
+            },
+          }
+        );
+      }
+
+      const data = await geminiResponse.json();
+      const text =
+        data.candidates?.[0]?.content?.parts?.[0]?.text ||
+        "No response generated.";
+
+      return new Response(JSON.stringify({ text }), {
+        headers: {
"Content-Type": "application/json", + "Access-Control-Allow-Origin": "*", + }, + }); + } catch (error) { + return new Response(JSON.stringify({ error: error.message }), { + status: 500, + headers: { + "Content-Type": "application/json", + "Access-Control-Allow-Origin": "*", + }, + }); + } + }, +}; diff --git a/docs/index.html b/docs/index.html new file mode 100644 index 0000000..34d025b --- /dev/null +++ b/docs/index.html @@ -0,0 +1,1773 @@ + + + + + + TinyGPU: Building a GPU Simulator in Python + + + + + + + + + + + +
+ PROJECT RETROSPECTIVE
+ I Built a GPU Simulator from Scratch in Python
+ Moving from the opaque "black box" of parallel debugging to a transparent, observable mental model.
+ 512 Threads | 1 Visualizer | 0 Hardware
+ The Motivation
+ The Headache of Parallel Debugging
+ There is a specific kind of pain when debugging parallel code. You launch 512 threads, and... silence. Or a race condition that happens once every thousand runs.
+ I realized I didn't actually understand how a GPU schedules work. I knew the theory—SIMT, warps, barriers—but I couldn't see it.
+ "If the entire state of the GPU is just a set of NumPy arrays, then the state is plottable."
+ Current Reality
+ > Segfault: Thread 42 out of bounds
+ > Memory Access Violation (Address 0x004F)
+ > ... (Opaque hardware state)
+ The Goal (TinyGPU)
+ Visualizing memory hotspots in real-time
+ The "Glass Box" Architecture
+ TinyGPU is designed to be fully observable. Click on the components below to understand how the system transforms code into visual insight.
+ 1. The Assembler: Parses .tgpu assembly files. Converts human-readable text into numeric instructions.
+ 2. The Core (TinyGPU): The heavy lifter. Uses NumPy for vectorized state (Registers, Memory, PC). Handles SIMT logic.
+ 3. The Visualizer: The "Flight Recorder". Replays the execution history as a frame-by-frame heatmap GIF.
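To make the assembler step concrete, here is a toy sketch of the kind of parsing it performs. This is illustrative only; `parse_line` and its output format are assumptions, not the repo's actual API:

```python
def parse_line(line: str):
    """Toy parse: 'ADD R0, R1, R2' -> ('ADD', [('reg', 0), ('reg', 1), ('reg', 2)])."""
    mnemonic, *tokens = line.replace(",", " ").split()
    operands = []
    for tok in tokens:
        if tok.upper().startswith("R") and tok[1:].isdigit():
            operands.append(("reg", int(tok[1:])))   # register operand
        else:
            operands.append(("imm", int(tok)))       # immediate value
    return mnemonic.upper(), operands

print(parse_line("ADD R0, R1, R2"))
```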
+ The Core (TinyGPU)
+ Instead of creating a Python object for every thread (which is slow), TinyGPU uses NumPy for everything. The registers are a single 2D array: self.registers = np.zeros((num_threads, num_registers)). This mimics the SIMD nature of real hardware.
+ • Stores 'PC', 'Registers', 'Memory', 'Flags'
+ • Runs the step() cycle
+ • Manages the 'Active Mask' for branching
+class TinyGPU:
+    def __init__(self, num_threads, memory_size):
+        # Global memory shared by all threads
+        self.memory = np.zeros(memory_size)
+        # Register file: one row of 8 registers per thread
+        self.registers = np.zeros((num_threads, 8))
+        # Per-thread program counters
+        self.pc = np.zeros(num_threads, dtype=int)
+        # Active mask: which threads execute this cycle
+        self.active = np.ones(num_threads, dtype=bool)
+ Powered by Gemini
+ AI Assembly Architect
+ Writing assembly is hard. Describe a parallel algorithm below, and the AI will generate the .tgpu assembly code using the TinyGPU instruction set.
+ Generated Output (.tgpu)
+ Waiting for input...
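For context, this is roughly how a client talks to the Cloudflare Worker proxy above. A minimal sketch: the URL is a hypothetical deployment, and Python's `requests` is a stand-in for the page's own fetch() call:

```python
import requests

WORKER_URL = "https://tinygpu-gemini-proxy.example.workers.dev"  # hypothetical deployment

resp = requests.post(WORKER_URL, json={
    "prompt": "Write a .tgpu kernel that adds two vectors of length 8.",
    "systemPrompt": "You generate TinyGPU assembly only.",  # optional system instruction
})
resp.raise_for_status()
print(resp.json()["text"])  # the generated .tgpu assembly
```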
+ Visualizing the "Heartbeat"
+ This interactive demo recreates the report's Odd-Even Transposition Sort example.
+ The Bar Chart represents Global Memory. Each bar is a value. In a parallel sort, adjacent pairs are compared and swapped simultaneously.
+ Controls (Phase: IDLE | Ops: 0)
+ Global Memory (Value Magnitude)
+ Active Thread Mask (Green = Executing)
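For reference, a minimal NumPy sketch of the odd-even transposition sort the demo replays; it mirrors the CSWAP pairing logic but is not the repo's actual kernel:

```python
import numpy as np

def odd_even_sort(mem: np.ndarray) -> np.ndarray:
    mem = mem.copy()
    n = len(mem)
    for phase in range(n):
        # Even phases pair (0,1), (2,3), ...; odd phases pair (1,2), (3,4), ...
        left = np.arange(phase % 2, n - 1, 2)
        right = left + 1
        swap = mem[left] > mem[right]              # CSWAP condition for each pair
        l, r = left[swap], right[swap]
        mem[l], mem[r] = mem[r].copy(), mem[l].copy()
    return mem

print(odd_even_sort(np.array([5, 1, 4, 2, 8, 0, 3, 7])))  # [0 1 2 3 4 5 7 8]
```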
+ Key Engineering Insights
+ Vectorized State
+ The Insight: A GPU is just a state machine. If state is data, it can be vectorized.
+ Instead of looping 512 times in Python (slow), TinyGPU uses NumPy slicing. ADD R0, R1, R2 becomes a single array operation: regs[:, 0] = regs[:, 1] + regs[:, 2]. This aligns Python's strengths (C-backed arrays) with the GPU's nature (SIMD).
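A small sketch of that claim; the array shape follows the report, and the register values are illustrative:

```python
import numpy as np

regs = np.zeros((512, 8))      # 512 threads x 8 registers
regs[:, 1] = np.arange(512)    # R1 holds each thread's id
regs[:, 2] = 2.0               # R2 holds the constant 2

# ADD R0, R1, R2 executes for all 512 threads as one NumPy operation
regs[:, 0] = regs[:, 1] + regs[:, 2]
```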
+ The Active Mask
+ The Challenge: What happens when Thread 0 takes the if and Thread 1 takes the else?
+ Real GPUs use an execution mask. In TinyGPU, I implemented self.active, a boolean array. Instructions only update state where active == True. Threads that don't take the branch execute "no-ops" until paths converge.
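A minimal sketch of masked execution, assuming the boolean active array described above:

```python
import numpy as np

regs = np.zeros((4, 8))
regs[:, 1] = [3, 7, 2, 9]      # R1 differs per thread

# Branch: only threads with R1 > 4 remain active for the "if" body
active = regs[:, 1] > 4

# A masked SET R0, 1: only active threads write; the rest are no-ops
regs[active, 0] = 1

print(regs[:, 0])  # [0. 1. 0. 1.]
```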
+ Synchronization
+ The Struggle: Implementing SYNC barriers in a serial loop.
+ I had to create a sync_waiting mask. Threads hit the barrier, mark themselves waiting, and do nothing until all(active_threads) are waiting. Debugging the barrier logic itself was a meta-challenge.
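A sketch of that barrier bookkeeping under my own simplifying assumptions; the function name and exact release policy are mine, not necessarily the repo's:

```python
import numpy as np

def execute_sync(pc, active, sync_waiting, tid):
    # Thread tid has reached the SYNC instruction: park it at the barrier
    sync_waiting[tid] = True
    if np.all(sync_waiting[active]):
        # Every active thread has arrived: release them all together
        sync_waiting[active] = False
        pc[active] += 1           # all step past the barrier in the same cycle
    # Otherwise the thread stalls: its PC is left pointing at SYNC
```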
+ Performance vs. Visibility
+ I sacrificed raw speed for "Observability". It runs thousands of ops/sec, not billions. But this slowness allows the "Flight Recorder" to capture every single state change for replay.
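That flight-recorder loop can be sketched in a few lines; step() is described above, while any_active() and the snapshot layout are assumptions:

```python
history = []
while gpu.any_active():           # assumed helper: some thread still running
    gpu.step()                    # advance the whole machine one cycle
    history.append({              # snapshot every piece of observable state
        "memory": gpu.memory.copy(),
        "registers": gpu.registers.copy(),
        "active": gpu.active.copy(),
    })
```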
+ Closing Reflection
+ "When you build a simulator, the magic dissolves. The GPU is no longer a beast to be tamed; it’s just a machine looping over arrays."
+ Observability is Feature #1.
+ What Works
+ • Visual Intuition of barriers
+ • Deterministic Unit Testing
+ • No Driver Installation
+ Limitations
+ • Pure Python speed (Slow)
+ • Simplified Cache Model
+ • Custom Toy ISA
+ Future Roadmap
+ • Warp Divergence Viz
+ • Python-to-TinyGPU Compiler
+ • Web-Based UI (You are here)
+ Check out the Repo
diff --git a/docs/index.md b/docs/index.md
deleted file mode 100644
index 7708c40..0000000
--- a/docs/index.md
+++ /dev/null
@@ -1,123 +0,0 @@
-# TinyGPU 🐉⚡ — v2.0.0
-
-[![Release v2.0.0](https://img.shields.io/badge/release-v2.0.0-blue.svg)](https://github.com/deaneeth/tinygpu/releases/tag/v2.0.0)
-[![Python 3.13](https://img.shields.io/badge/Python-3.13-blue.svg)](https://www.python.org/downloads/)
-[![License: MIT](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE)
-[![CI](https://github.com/deaneeth/tinygpu/actions/workflows/ci.yml/badge.svg)](https://github.com/deaneeth/tinygpu/actions)
-[![Tests](https://img.shields.io/github/actions/workflow/status/deaneeth/tinygpu/ci.yml?label=tests)](https://github.com/deaneeth/tinygpu/actions)
-[![Code Style: Black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
-
-TinyGPU is a **tiny educational GPU simulator** — a minimal SIMT-style simulator with:
-
-- Per-thread registers & program counters
-- Shared global memory and per-block shared memory
-- A small GPU-style ISA and assembler
-- Visualizer and GIF export for educational animations
-
-> 🎓 *Built for learning and visualization - see how threads, registers, and memory interact across cycles!*
-
----
-
-## 🚀 What's New in v2.0.0
-
-- **Enhanced Instruction Set**:
-  - Added `SHLD` and `SHST` for robust shared memory operations.
-  - Improved `SYNC` semantics for better thread coordination.
-- **Visualizer Improvements**:
-  - Export execution as GIFs with enhanced clarity.
-  - Added support for saving visuals directly from the simulator.
-- **Refactored Core**:
-  - Simplified step semantics for better extensibility.
-  - Optimized performance for larger thread counts.
-- **CI/CD Updates**:
-  - Integrated linting (`ruff`, `black`) and testing workflows.
-  - Automated builds and tests on GitHub Actions.
-- **Documentation**:
-  - Expanded examples and added detailed usage instructions.
-
----
-
-## Quick Screenshots / Demos
-
-### Odd–Even Transposition Sort
-
-![Odd-Even Sort](../src/outputs/run_odd_even_sort/run_odd_even_sort_20251026-212558.gif)
-
-### Parallel Reduction (Sum)
-
-![Reduce Sum](../src/outputs/run_reduce_sum/run_reduce_sum_20251026-212712.gif)
-
----
-
-## Getting Started
-
-Clone and install (editable):
-
-```bash
-git clone https://github.com/deaneeth/tinygpu.git
-cd tinygpu
-pip install -e .
-pip install -r requirements-dev.txt
-```
-
-Run a demo (odd-even sort):
-
-```bash
-python -m examples.run_odd_even_sort
-```
-
-> Produces: `outputs/run_odd_even_sort/run_odd_even_sort_*.gif` — a visual GPU-style sorting process.
+ + + + + + + diff --git a/docs/index.md b/docs/index.md deleted file mode 100644 index 7708c40..0000000 --- a/docs/index.md +++ /dev/null @@ -1,123 +0,0 @@ -# TinyGPU 🐉⚡ — v2.0.0 - -[![Release v2.0.0](https://img.shields.io/badge/release-v2.0.0-blue.svg)](https://github.com/deaneeth/tinygpu/releases/tag/v2.0.0) -[![Python 3.13](https://img.shields.io/badge/Python-3.13-blue.svg)](https://www.python.org/downloads/) -[![License: MIT](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE) -[![CI](https://github.com/deaneeth/tinygpu/actions/workflows/ci.yml/badge.svg)](https://github.com/deaneeth/tinygpu/actions) -[![Tests](https://img.shields.io/github/actions/workflow/status/deaneeth/tinygpu/ci.yml?label=tests)](https://github.com/deaneeth/tinygpu/actions) -[![Code Style: Black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) - -TinyGPU is a **tiny educational GPU simulator** — a minimal SIMT-style simulator with: - -- Per-thread registers & program counters -- Shared global memory and per-block shared memory -- A small GPU-style ISA and assembler -- Visualizer and GIF export for educational animations - -> 🎓 *Built for learning and visualization - see how threads, registers, and memory interact across cycles!* - ---- - -## 🚀 What's New in v2.0.0 - -- **Enhanced Instruction Set**: - - Added `SHLD` and `SHST` for robust shared memory operations. - - Improved `SYNC` semantics for better thread coordination. -- **Visualizer Improvements**: - - Export execution as GIFs with enhanced clarity. - - Added support for saving visuals directly from the simulator. -- **Refactored Core**: - - Simplified step semantics for better extensibility. - - Optimized performance for larger thread counts. -- **CI/CD Updates**: - - Integrated linting (`ruff`, `black`) and testing workflows. - - Automated builds and tests on GitHub Actions. -- **Documentation**: - - Expanded examples and added detailed usage instructions. - ---- - -## Quick Screenshots / Demos - -### Odd–Even Transposition Sort - -![Odd-Even Sort](../src/outputs/run_odd_even_sort/run_odd_even_sort_20251026-212558.gif) - -### Parallel Reduction (Sum) - -![Reduce Sum](../src/outputs/run_reduce_sum/run_reduce_sum_20251026-212712.gif) - ---- - -## Getting Started - -Clone and install (editable): - -```bash -git clone https://github.com/deaneeth/tinygpu.git -cd tinygpu -pip install -e . -pip install -r requirements-dev.txt -``` - -Run a demo (odd-even sort): - -```bash -python -m examples.run_odd_even_sort -``` - -> Produces: `outputs/run_odd_even_sort/run_odd_even_sort_*.gif` — a visual GPU-style sorting process. 
- ---- - -## Examples & Runners - -- `examples/run_vector_add.py` — simple parallel vector add -- `examples/run_vector_add_kernel.py` — vector add with kernel arguments -- `examples/run_test_loop.py` — branch/loop test (sum 1..4) -- `examples/run_test_cmp.py` — comparison and branching test -- `examples/run_test_kernel_args.py` — kernel arguments test -- `examples/run_odd_even_sort.py` — odd-even transposition sort (GIF) -- `examples/run_reduce_sum.py` — parallel reduction (GIF) -- `examples/run_block_shared_sum.py` — per-block shared memory example -- `examples/run_sync_test.py` — synchronization test -- `examples/debug_repl.py` — interactive REPL debugger - ---- - -## Instruction Set (Quick Reference) - -| **Instruction** | **Operands** | **Description** | -|-----------------------------|------------------------------------------|-----------------| -| `SET Rd, imm` | `Rd` = destination register, `imm` = immediate value | Set register `Rd` to an immediate constant. | -| `ADD Rd, Ra, Rb` | `Rd` = destination, `Ra` + `Rb` | Add two registers and store result in `Rd`. | -| `ADD Rd, Ra, imm` | `Rd` = destination, `Ra` + immediate | Add register and immediate value. | -| `MUL Rd, Ra, Rb` | Multiply two registers. | `Rd = Ra * Rb` | -| `MUL Rd, Ra, imm` | Multiply register by immediate. | `Rd = Ra * imm` | -| `LD Rd, addr` | Load from memory address into register. | `Rd = mem[addr]` | -| `LD Rd, Rk` | Load from address in register `Rk`. | `Rd = mem[Rk]` | -| `ST addr, Rs` | Store register into memory address. | `mem[addr] = Rs` | -| `ST Rk, Rs` | Store value from `Rs` into memory at address in register `Rk`. | `mem[Rk] = Rs` | -| `SHLD Rd, saddr` | Load from shared memory into register. | `Rd = shared_mem[saddr]` | -| `SHST saddr, Rs` | Store register into shared memory. | `shared_mem[saddr] = Rs` | -| `CSWAP addrA, addrB` | Compare-and-swap memory values. | If `mem[addrA] > mem[addrB]`, swap them. Used for sorting. | -| `CMP Ra, Rb` | Compare and set flags. | Set Z/N/G flags based on `Ra - Rb`. | -| `BRGT target` | Branch if greater. | Jump to `target` if G flag set. | -| `BRLT target` | Branch if less. | Jump to `target` if N flag set. | -| `BRZ target` | Branch if zero. | Jump to `target` if Z flag set. | -| `JMP target` | Label or immediate. | Unconditional jump — sets PC to `target`. | -| `SYNC` | *(no operands)* | Global synchronization barrier — all threads must reach this point. | -| `SYNCB` | *(no operands)* | Block-level synchronization barrier. | - ---- - -## Publishing & Contributing - -- See `.github/workflows/ci.yml` for CI and packaging -- To propose changes, open a PR. For bug reports, open an issue. - ---- - -## License - -MIT — See [LICENSE](../LICENSE).