GPU Capacity

LLM GPU capacity calculator — size your cluster in seconds

Pick the model, your peak concurrency, and your latency target. Get the exact GPU type and count for self-hosting, plus a Plugsky alternative that runs on shared infrastructure.

Workload

Skip the hardware math

Plugsky runs the same models on shared infrastructure — pay flat monthly, scale on demand.

Start $5 trial → Private endpoint
/**
 * LLM GPU capacity calculator (browser widget).
 *
 * Rough sizing: VRAM needed, tokens/sec per GPU, then GPU count.
 * Reads the form fields `model`, `users`, `inp`, `out`, `latency` and `quant`,
 * and writes an HTML summary into the `result` element whenever `calcBtn`
 * is clicked. One calculation is triggered immediately on load.
 *
 * Everything is kept inside an IIFE so no globals are created.
 */
(function () {
  'use strict';

  // The widget is purely DOM-driven; stay inert when evaluated without a
  // document (for example during prerendering) instead of throwing on load.
  if (typeof document === 'undefined') {
    return;
  }

  // Per-model sizing table. fp16/int8/int4 are the VRAM footprints in GB for
  // each quantization; tpsA100/tpsH100 are baseline tokens/sec per GPU.
  var MODELS = {
    '7b': { fp16: 14, int8: 8, int4: 5, tpsA100: 220, tpsH100: 420 },
    '13b': { fp16: 26, int8: 14, int4: 8, tpsA100: 130, tpsH100: 250 },
    '18b': { fp16: 36, int8: 20, int4: 12, tpsA100: 100, tpsH100: 200 },
    '70b': { fp16: 140, int8: 70, int4: 40, tpsA100: 30, tpsH100: 70 },
    '120b': { fp16: 240, int8: 130, int4: 80, tpsA100: 18, tpsH100: 40 }
  };

  // Multiplier applied to the baseline per-GPU throughput for each quantization.
  var QUANT_THROUGHPUT_FACTOR = { fp16: 0.6, int8: 1, int4: 2 };

  // A bit of overhead on top of the raw model footprint.
  var VRAM_OVERHEAD_FACTOR = 1.3;

  // GPU options shown in the result, in display order.
  var GPUS = [
    { tpsKey: 'tpsA100', heading: '🅰 On A100 80GB', name: 'A100 80GB', vramGb: 80, annualCostUsd: 30000 },
    { tpsKey: 'tpsH100', heading: '🅷 On H100 80GB', name: 'H100 80GB', vramGb: 80, annualCostUsd: 50000 }
  ];

  /** Own-property check that is safe for keys coming from form values. */
  function hasOwn(obj, key) {
    return Object.prototype.hasOwnProperty.call(obj, key);
  }

  /** Formats a number for display with at most one fraction digit. */
  function fmt(n) {
    // The option is `maximumFractionDigits`; an unknown option name is
    // silently ignored, which would leave up to three fraction digits.
    return n.toLocaleString(undefined, { maximumFractionDigits: 1 });
  }

  /** Reads a form field by id and parses it as a base-10 integer (NaN if invalid). */
  function readInt(id) {
    return parseInt(document.getElementById(id).value, 10);
  }

  /** True for finite numbers that are zero or greater. */
  function isNonNegativeNumber(n) {
    return typeof n === 'number' && isFinite(n) && n >= 0;
  }

  /**
   * Required aggregate throughput:
   * (concurrent_users × tokens_per_request) / (latency_target ÷ 2).
   *
   * NOTE(review): the latency value is shown to the user as `latTarget / 1000`
   * seconds, which suggests it is in milliseconds, yet it is used here without
   * conversion. Confirm the unit of the `latency` field; the formula is kept
   * exactly as it was.
   */
  function requiredTokensPerSec(users, inp, out, latTarget) {
    var tokensPerReq = inp + out;
    return (users * tokensPerReq) / Math.max(1, latTarget / 2);
  }

  /**
   * Sizes one GPU type for the given model configuration.
   *
   * The count is the larger of the throughput-driven count and the number of
   * GPUs needed just to hold the model in memory, so a model that does not fit
   * on a single GPU is never reported as needing one (or zero) GPUs.
   *
   * @returns {{count: number, vramPerGpuGb: number}} `vramPerGpuGb` is the
   *   model footprint (with overhead) split across the minimum number of GPUs
   *   able to hold it.
   */
  function sizeForGpu(gpu, cfg, quant, totalTokensPerSec) {
    var perGpuTps = cfg[gpu.tpsKey] * QUANT_THROUGHPUT_FACTOR[quant];
    var modelVramGb = cfg[quant] * VRAM_OVERHEAD_FACTOR;
    var gpusForThroughput = Math.ceil(totalTokensPerSec / perGpuTps);
    var gpusForMemory = Math.ceil(modelVramGb / gpu.vramGb);
    return {
      count: Math.max(gpusForThroughput, gpusForMemory),
      vramPerGpuGb: modelVramGb / gpusForMemory
    };
  }

  // NOTE(review): the original tags and class names of the result template
  // were not visible in the reviewed source (only the text content was).
  // The markup below is a neutral stand-in; restore the page's real elements
  // and classes so the existing stylesheet applies.

  /** Renders one "label / value" row. */
  function renderRow(label, value) {
    return '<div><div>' + label + '</div>\n' + value + '</div>';
  }

  /** Renders the result card for one GPU type. */
  function renderGpuCard(gpu, sizing) {
    return '<div>' +
      '<h3>' + gpu.heading + '</h3>' +
      renderRow('GPUs needed', sizing.count + ' × ' + gpu.name) +
      renderRow('VRAM per GPU', fmt(sizing.vramPerGpuGb) + ' GB') +
      renderRow('Total VRAM', (sizing.count * gpu.vramGb) + ' GB') +
      renderRow('Annual cost (on-demand)', '$' + (sizing.count * gpu.annualCostUsd).toLocaleString()) +
      '<div>assumes 3-year reserved on-demand pricing</div>' +
      '</div>';
  }

  /** Renders the full result: required throughput, one card per GPU, method note. */
  function renderResult(cfg, quant, totalTokensPerSec, latTarget) {
    var cards = '';
    for (var i = 0; i < GPUS.length; i += 1) {
      cards += renderGpuCard(GPUS[i], sizeForGpu(GPUS[i], cfg, quant, totalTokensPerSec));
    }
    return '<div>Required throughput</div>\n' +
      '<div>' + fmt(totalTokensPerSec) + ' tokens/sec at p95 ≤ ' + (latTarget / 1000) + 's</div>\n\n' +
      '<div>' + cards + '</div>' +
      '<div>' +
      'Method: tokens-per-second = (concurrent_users × tokens_per_request) / (latency_target ÷ 2). ' +
      'GPUs needed = max(ceil(tokens_per_sec ÷ per-GPU throughput), ceil(model VRAM ÷ GPU VRAM)). ' +
      'Quantization halves VRAM and roughly doubles throughput. ' +
      'These are estimates — actual performance depends on framework (vLLM, TensorRT-LLM, SGLang), batching strategy, and prompt distribution.' +
      '</div>';
  }

  /** Makes the result panel visible and replaces its content. */
  function showResult(html) {
    var resultEl = document.getElementById('result');
    resultEl.style.display = 'block';
    resultEl.innerHTML = html;
  }

  var calcBtn = document.getElementById('calcBtn');
  if (!calcBtn) {
    // Nothing to wire up on pages that do not contain the calculator form.
    return;
  }

  calcBtn.onclick = function () {
    var m = document.getElementById('model').value;
    var quant = document.getElementById('quant').value;
    var users = readInt('users');
    var inp = readInt('inp');
    var out = readInt('out');
    var latTarget = readInt('latency');

    // Reject unknown selections and empty/invalid numbers up front instead of
    // throwing or rendering "NaN" into the page.
    var selectionOk = hasOwn(MODELS, m) && hasOwn(QUANT_THROUGHPUT_FACTOR, quant);
    var numbersOk = isNonNegativeNumber(users) && isNonNegativeNumber(inp) &&
      isNonNegativeNumber(out) && isNonNegativeNumber(latTarget);
    if (!selectionOk || !numbersOk) {
      showResult('<div>Please choose a model and quantization, and enter non-negative whole numbers for users, tokens and latency.</div>');
      return;
    }

    var totalTokensPerSec = requiredTokensPerSec(users, inp, out, latTarget);
    showResult(renderResult(MODELS[m], quant, totalTokensPerSec, latTarget));
  };

  // Show an initial result for the form's default values.
  calcBtn.click();
})();