Complete guide to building high-performance ML APIs with FastAPI, Redis caching, and production-grade monitoring.
If you’re still spinning up Flask apps for ML inference in 2025, you’re leaving performance (and sanity) on the table.
FastAPI gives you:
- native async request handling
- request and response validation with Pydantic
- automatic OpenAPI/Swagger docs
- type hints that double as documentation
For SabiScore, migrating to FastAPI was a key step in getting from 450ms to 87ms p95 latency (i.e., comfortably <100ms).
Here’s a simplified version of a structure that works well:
app/
  main.py            # FastAPI app, routes
  api/
    v1/
      predict.py     # /predict endpoint
  core/
    config.py        # Settings, env vars
    logging.py       # Logging setup
    monitoring.py    # Prometheus metrics
  models/
    loader.py        # Model loading, registry
  features/
    builder.py       # Feature engineering
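core/config.py backs the settings object used throughout the rest of this post. It isn’t shown here, but a minimal sketch, assuming pydantic-settings with the field names inferred from how settings is used below, looks like this:

# app/core/config.py — minimal sketch; field names inferred from usage below, defaults are illustrative
from pydantic_settings import BaseSettings

class Settings(BaseSettings):
    FRONTEND_ORIGIN: str = "http://localhost:3000"
    MODEL_PATH: str = "models/ensemble.joblib"
    REDIS_URL: str = "redis://localhost:6379/0"

settings = Settings()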
You want clear boundaries:
- features doesn’t know anything about HTTP
- models doesn’t know anything about Redis
- api just orchestrates

# app/main.py
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from app.api.v1 import predict
from app.core.config import settings
from app.models.loader import load_ensemble_model
app = FastAPI(title="SabiScore API", version="2.0")
app.add_middleware(
    CORSMiddleware,
    allow_origins=[settings.FRONTEND_ORIGIN],
    allow_methods=["*"],
    allow_headers=["*"],
)

@app.on_event("startup")
async def startup_event() -> None:
    # Load model once at startup
    app.state.model = load_ensemble_model(settings.MODEL_PATH)

@app.get("/health")
async def health() -> dict:
    return {"status": "ok"}

app.include_router(predict.router, prefix="/api/v1")
The key detail: app.state.model is loaded once. No re-loading on each request.
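load_ensemble_model itself isn’t shown in this post; a minimal sketch, assuming a joblib-serialized ensemble artifact, looks like this:

# app/models/loader.py — sketch, assuming a joblib-serialized ensemble
from pathlib import Path

import joblib

def load_ensemble_model(model_path: str):
    """Load the serialized ensemble once; called from the startup hook."""
    path = Path(model_path)
    if not path.exists():
        raise FileNotFoundError(f"Model artifact not found: {path}")
    return joblib.load(path)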
# app/api/v1/predict.py
import json

from fastapi import APIRouter, HTTPException, Request
from pydantic import BaseModel, Field

from app.core.cache import get_cache, set_cache
from app.features.builder import build_features

router = APIRouter()

class PredictionRequest(BaseModel):
    home_team: str = Field(..., example="Arsenal")
    away_team: str = Field(..., example="Chelsea")
    league: str = Field(..., example="Premier League")
    kickoff_time: str = Field(..., example="2024-12-01T16:30:00Z")

class PredictionResponse(BaseModel):
    home_win_prob: float
    draw_prob: float
    away_win_prob: float
    recommended_bet: str
    confidence: str
    reasoning: list[str]
Expressive types here make your API self-documenting.
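To see what that buys you, here’s a quick standalone check (not part of the app) of what FastAPI does with an invalid payload before your handler ever runs: it rejects the request with a 422 and field-level errors built from the same model.

# Standalone illustration: the validation FastAPI runs before calling the endpoint
from pydantic import ValidationError

try:
    PredictionRequest(
        home_team="Arsenal",
        away_team="Chelsea",
        league="Premier League",
        # kickoff_time deliberately omitted
    )
except ValidationError as exc:
    print(exc.errors())  # e.g. [{'loc': ('kickoff_time',), 'msg': 'Field required', ...}]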
# app/core/cache.py
import json

# aioredis has been folded into redis-py; redis.asyncio is the maintained API
from redis import asyncio as aioredis

from app.core.config import settings

_redis = None

async def get_redis():
    # Lazily create a single shared client (from_url itself is synchronous)
    global _redis
    if _redis is None:
        _redis = aioredis.from_url(
            settings.REDIS_URL, encoding="utf-8", decode_responses=True
        )
    return _redis

async def get_cache(key: str) -> str | None:
    redis = await get_redis()
    return await redis.get(key)

async def set_cache(key: str, value: dict, ttl: int = 3600) -> None:
    redis = await get_redis()
    await redis.setex(key, ttl, json.dumps(value))
In the route:
# app/api/v1/predict.py (continued)
@router.post("/predict", response_model=PredictionResponse)
async def predict_match(request: Request, body: PredictionRequest):
    cache_key = f"pred:{body.home_team}:{body.away_team}:{body.kickoff_time}"

    # Serve straight from Redis when we've already scored this fixture
    cached = await get_cache(cache_key)
    if cached:
        return PredictionResponse(**json.loads(cached))

    model = request.app.state.model
    features = await build_features(body)
    probs = model.predict_proba(features)  # ordered [away, draw, home] for this ensemble

    response = PredictionResponse(
        home_win_prob=round(probs[2], 3),
        draw_prob=round(probs[1], 3),
        away_win_prob=round(probs[0], 3),
        recommended_bet=choose_recommendation(probs),  # helpers not shown here
        confidence=rating_from_probs(probs),
        reasoning=generate_reasoning(features, probs),
    )

    await set_cache(cache_key, response.dict())
    return response
Result in production: 73% of requests served from cache.
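A hit rate like that is worth measuring rather than guessing. One hypothetical way (not in the original code) is a labelled prometheus_client Counter around the cache check, which makes the rate graphable later:

# Hypothetical: count cache hits vs. misses so the hit rate can be graphed
from prometheus_client import Counter

cache_lookups = Counter(
    "prediction_cache_lookups_total",
    "Prediction cache lookups",
    ["outcome"],  # "hit" or "miss"
)

# In the route, right after the cache check:
# cache_lookups.labels(outcome="hit" if cached else "miss").inc()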
# app/core/monitoring.py
from prometheus_client import Counter, Histogram

prediction_requests = Counter(
    "prediction_requests_total",
    "Total prediction requests",
    ["result"],
)

prediction_latency = Histogram(
    "prediction_latency_seconds",
    "Time spent serving prediction",
    buckets=[0.01, 0.05, 0.1, 0.25, 0.5, 1.0],
)
In your route:
import time

from app.core.monitoring import prediction_requests, prediction_latency

@router.post("/predict", response_model=PredictionResponse)
async def predict_match(request: Request, body: PredictionRequest):
    start = time.perf_counter()
    try:
        # ... prediction logic ...
        prediction_requests.labels(result="success").inc()
        return response
    except Exception as exc:
        prediction_requests.labels(result="error").inc()
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    finally:
        prediction_latency.observe(time.perf_counter() - start)
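Prometheus still needs something to scrape. One way, assuming the stock prometheus_client ASGI app (not shown in the original), is to mount it in main.py:

# app/main.py (addition) — expose the metrics endpoint for Prometheus to scrape
from prometheus_client import make_asgi_app

app.mount("/metrics", make_asgi_app())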
With Grafana on top of Prometheus, you get a live dashboard: request rate, error rate, and latency percentiles, straight from the two metrics above.
Offer /predict/batch when clients can use it, so a single request amortizes feature building and model calls across a whole slate of fixtures.
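A hypothetical shape for that endpoint, reusing the single-match handler (a real version would vectorize feature building and score the whole batch in one predict_proba call):

# Hypothetical /predict/batch, added alongside the single-match route
class BatchPredictionRequest(BaseModel):
    matches: list[PredictionRequest]

@router.post("/predict/batch", response_model=list[PredictionResponse])
async def predict_batch(request: Request, body: BatchPredictionRequest):
    # Naive version: loop over the single-match path (each lookup still hits the cache)
    return [await predict_match(request, match) for match in body.matches]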
FastAPI is not magic. But paired with:
- clear module boundaries
- a model loaded once at startup
- Redis caching
- Prometheus metrics with Grafana on top
…it becomes an ideal host for production ML.
If you want to see these ideas applied end-to-end, read next:
Or if you’d rather skip the infrastructure pain and go straight to impact, reach out and let’s design your FastAPI-based ML system.