In this tutorial, we explore kvcached, a dynamic KV-cache implementation on top of vLLM, to understand how dynamic KV-cache allocation transforms GPU memory usage for large language models. We begin by setting up the environment and deploying lightweight Qwen2.5 models through an OpenAI-compatible API, ensuring a realistic inference workflow. We then design controlled experiments where we simulate bursty workloads to observe how memory behaves under both elastic and static allocation strategies.

Through systematic measurement and visualization, we directly compare VRAM utilization and latency, and extend the setup to a multi-model scenario where we observe how memory flexibly shifts across active workloads in real time.

```python
import os, sys, time, json, subprocess, threading, signal, shutil
from pathlib import Path

def sh(cmd, check=True):
    return subprocess.run(cmd, check=check, shell=isinstance(cmd, str))

try:
    import torch
except ImportError:
    sh([sys.executable, "-m", "pip", "install", "-q", "torch"])
    import torch

assert torch.cuda.is_available(), \
    "No GPU detected. In Colab: Runtime > Change runtime type > GPU."

props = torch.cuda.get_device_properties(0)
print(f"[GPU] {torch.cuda.get_device_name(0)} "
      f"({props.total_memory / 1e9:.1f} GB, "
      f"compute capability {props.major}.{props.minor})")

def pip_install(*pkgs, extra=()):
    subprocess.run([sys.executable, "-m", "pip", "install", "-q", *pkgs, *extra],
                   check=True)

print("[install] vLLM ...")
pip_install("vllm==0.10.2")
print("[install] kvcached (compiles a small CUDA extension) ...")
pip_install("kvcached", extra=["--no-build-isolation"])
print("[install] misc (matplotlib, requests, pynvml) ...")
pip_install("matplotlib", "requests", "pynvml", "numpy")

MODEL_A = "Qwen/Qwen2.5-0.5B-Instruct"
MODEL_B = "Qwen/Qwen2.5-1.5B-Instruct"
PORT_A, PORT_B = 8001, 8002
MAX_MODEL_LEN = 2048
```

We start by setting up the environment and verifying that a GPU is available for our experiments. We install all required dependencies, including vLLM and kvcached, along with supporting libraries.

We then define our model configurations and ports to prepare for launching the inference servers.

```python
def launch_vllm(model, port, kvcached=True, gpu_mem_util=0.55, log_path=None):
    """Start a vLLM OpenAI-compatible server as a subprocess. With
    kvcached=True the autopatch hooks replace vLLM's KV-cache allocator
    with the elastic one."""
    env = os.environ.copy()
    env["VLLM_USE_V1"] = "1"
    if kvcached:
        env["ENABLE_KVCACHED"] = "true"
        env["KVCACHED_AUTOPATCH"] = "1"
        env["KVCACHED_IPC_NAME"] = f"kvc_{port}"
    cmd = [
        sys.executable, "-m", "vllm.entrypoints.openai.api_server",
        "--model", model,
        "--port", str(port),
        "--max-model-len", str(MAX_MODEL_LEN),
        "--disable-log-requests",
        "--no-enable-prefix-caching",
        "--enforce-eager",
    ]
    if not kvcached:
        cmd += ["--gpu-memory-utilization", str(gpu_mem_util)]
    log = open(log_path or os.devnull, "w")
    proc = subprocess.Popen(cmd, env=env, stdout=log,
                            stderr=subprocess.STDOUT, preexec_fn=os.setsid)
    return proc, log

def wait_ready(port, timeout=420):
    import requests
    url = f"http://localhost:{port}/v1/models"
    t0 = time.time()
    while time.time() - t0 < timeout:
        ...
```

[NOTE: the original listing was garbled during extraction — everything between the `<` in the `while` condition above and a later `>` (the body of `wait_ready`, the helper definitions `vram_used_mb`, `MemorySampler`, `bursty_workload`, `shutdown`, and the full code for Experiments 1–2) was stripped as though it were an HTML tag. The surviving fragments are preserved below; recover the complete listing from the original tutorial source.]

```python
# Experiments 1–2 summary — the opening of the first print statement was lost
# in extraction; the "Idle VRAM kvcached: {idle_kvc:>" prefix is a
# reconstruction to be verified against the original.
print(f"  Idle VRAM   kvcached: {idle_kvc:>6.0f} MB "
      f"baseline: {idle_base:>6.0f} MB "
      f"(savings: {idle_base - idle_kvc:>5.0f} MB)")
print(f"  Peak VRAM   kvcached: {max(mk):>6.0f} MB "
      f"baseline: {max(mb):>6.0f} MB")
print(f"  Median lat. kvcached: {np.median(lat_kvc):>6.2f} s "
      f"baseline: {np.median(lat_base):>6.2f} s")
print(f"  VRAM flex   kvcached: peak-idle = {max(mk)-min(mk):>5.0f} MB "
      f"(baseline can't release -- static pool)")

print("\n=== Experiment 3: Two LLMs sharing one GPU (kvcached on both) ===")
pA, lA = launch_vllm(MODEL_A, PORT_A, kvcached=True, log_path="/tmp/mA.log")
try:
    wait_ready(PORT_A)
    pB, lB = launch_vllm(MODEL_B, PORT_B, kvcached=True, log_path="/tmp/mB.log")
    try:
        wait_ready(PORT_B)
        print(f" Both models loaded. Idle VRAM: {vram_used_mb():.0f} MB")
        sampler = MemorySampler(); sampler.start()
        for i in range(4):
            port, model = ((PORT_A, MODEL_A) if i % 2 == 0
                           else (PORT_B, MODEL_B))
            print(f" round {i+1}: driving {model}")
            bursty_workload(port, model, n_bursts=1, burst_size=4, pause=0)
            time.sleep(5)
        sampler.stop()
        t, m = zip(*sampler.samples)
        plt.figure(figsize=(11, 4.2))
        plt.plot(t, m, color="#c2410c", linewidth=2)
        plt.xlabel("time (s)"); plt.ylabel("GPU memory used (MB)")
        plt.title("Two LLMs on one T4 via kvcached — memory flexes per active model")
        plt.grid(alpha=.3); plt.tight_layout()
        plt.savefig("/content/kvcached_multillm.png", dpi=120, bbox_inches="tight")
        plt.show()
    finally:
        shutdown(pB, lB)
finally:
    shutdown(pA, lA)

print("\n=== Bonus: kvcached ships CLI tools ===")
print("  kvtop — live per-instance KV memory monitor (like nvtop for kvcached)")
print("  kvctl — set/limit per-instance memory budgets in shared memory")
for tool in ("kvtop", "kvctl"):
    path = shutil.which(tool)
    print(f"  {tool}: {path or 'not on PATH'}")
print("\nAll plots saved to /content/. Done.")
```

We visuali