Skip to content

Resource Limits & Quotas

Comprehensive resource management, rate limiting, and quota enforcement for production deployments.

Overview

The Limits module (src/limits/) provides fine-grained control over resource consumption, API rate limiting, and usage quotas. This is essential for:

  • Cost Control: Prevent runaway LLM costs
  • Fair Usage: Multi-tenant rate limiting
  • System Protection: Prevent resource exhaustion
  • Compliance: Meet SLA requirements

Architecture

The limits system consists of four main components:

┌─────────────────┐
│    Enforcer     │  Main entry point
└────────┬────────┘
    ┌────┴─────────────────────────┐
    │                              │
┌───▼──────┐  ┌──────────┐  ┌─────▼──────┐
│ Governor │  │ Metrics  │  │   Quotas   │
│(Rate     │  │(Tracking)│  │(Budgets)   │
│Limiting) │  │          │  │            │
└──────────┘  └──────────┘  └────────────┘

Enforcer

Main coordination component that enforces all limits:

use stratarouter_runtime::limits::{Enforcer, LimitConfig};

let config = LimitConfig {
    max_concurrent_executions: 100,
    max_requests_per_minute: 1000,
    max_cost_per_hour: 50.0,
    max_memory_mb: 4096,
    enable_quotas: true,
    enable_metrics: true,
};

let enforcer = Enforcer::new(config);

// Check if request is allowed
match enforcer.check_limits(tenant_id, request).await {
    Ok(()) => {
        // Execute request
        let result = execute(request).await?;

        // Record usage
        enforcer.record_usage(tenant_id, result.cost, result.latency).await;
    }
    Err(LimitExceeded::RateLimit) => {
        return Err("Rate limit exceeded, try again later");
    }
    Err(LimitExceeded::QuotaExceeded) => {
        return Err("Monthly quota exceeded");
    }
    // Rust requires exhaustive matches: cover the remaining limit
    // variants (concurrency, cost, memory) with a catch-all arm
    Err(_) => {
        return Err("Resource limit exceeded");
    }
}

Configuration

pub struct LimitConfig {
    /// Maximum concurrent executions
    pub max_concurrent_executions: usize,

    /// Max requests per minute (per tenant)
    pub max_requests_per_minute: usize,

    /// Max cost per hour (USD)
    pub max_cost_per_hour: f64,

    /// Max memory usage (MB)
    pub max_memory_mb: usize,

    /// Enable quota tracking
    pub enable_quotas: bool,

    /// Enable metrics collection
    pub enable_metrics: bool,
}

Governor (Rate Limiting)

Token bucket algorithm for rate limiting:

use stratarouter_runtime::limits::governor::{Governor, RateLimitConfig};

// Per-tenant rate limiting
let config = RateLimitConfig {
    requests_per_second: 10.0,
    burst_size: 20,
    refill_strategy: RefillStrategy::Continuous,
};

let governor = Governor::new(config);

// Check if request is allowed
if governor.check_rate_limit(tenant_id).await? {
    // Process request
    execute_request().await?;
} else {
    // Rate limited
    let retry_after = governor.retry_after(tenant_id).await?;
    return Err(format!("Rate limited, retry after {}s", retry_after));
}

Token Bucket Algorithm

Tokens: ████████░░  (8/10 available)
Request consumes 1 token
Tokens: ███████░░░  (7/10 available)
Tokens refill at rate (e.g., 1/second)
Tokens: ████████░░  (8/10 available)

Configuration Options

pub struct RateLimitConfig {
    /// Maximum requests per second
    pub requests_per_second: f64,

    /// Burst allowance (tokens)
    pub burst_size: usize,

    /// Refill strategy
    pub refill_strategy: RefillStrategy,
}

pub enum RefillStrategy {
    /// Fixed interval refill
    FixedInterval { interval_ms: u64 },

    /// Continuous refill
    Continuous,
}

Per-Route Rate Limiting

Different limits for different routes:

let mut governor = Governor::new_multi();

// Free tier: 10 req/s
governor.add_limit("free_tier", RateLimitConfig {
    requests_per_second: 10.0,
    burst_size: 20,
    refill_strategy: RefillStrategy::Continuous,
});

// Pro tier: 100 req/s
governor.add_limit("pro_tier", RateLimitConfig {
    requests_per_second: 100.0,
    burst_size: 200,
    refill_strategy: RefillStrategy::Continuous,
});

// Check appropriate limit
let tier = get_user_tier(user_id);
if governor.check_rate_limit_for(tier, user_id).await? {
    // Process
}

Metrics Tracking

Detailed usage tracking for analytics and billing:

use stratarouter_runtime::limits::metrics::{MetricsCollector, Usage};

let collector = MetricsCollector::new();

// Record execution
collector.record_execution(Usage {
    tenant_id: "tenant_123".to_string(),
    route_id: "gpt4".to_string(),
    cost_usd: 0.03,
    latency_ms: 1250,
    tokens_used: 500,
    success: true,
    timestamp: Utc::now(),
}).await;

// Query metrics
let stats = collector.get_stats("tenant_123", time_range).await?;
println!("Total cost: ${:.2}", stats.total_cost);
println!("Average latency: {}ms", stats.avg_latency_ms);
println!("Success rate: {:.1}%", stats.success_rate * 100.0);

Metrics Data Structure

pub struct ExecutionMetrics {
    /// Total number of requests
    pub total_requests: usize,

    /// Successful requests
    pub successful_requests: usize,

    /// Failed requests
    pub failed_requests: usize,

    /// Total cost (USD)
    pub total_cost: f64,

    /// Average latency (ms)
    pub avg_latency_ms: f64,

    /// P50 latency (ms)
    pub p50_latency_ms: f64,

    /// P95 latency (ms)
    pub p95_latency_ms: f64,

    /// P99 latency (ms)
    pub p99_latency_ms: f64,

    /// Total tokens used
    pub total_tokens: usize,

    /// Success rate (0.0-1.0)
    pub success_rate: f64,

    /// Cost per request
    pub cost_per_request: f64,
}

Cost Tracking

Detailed cost attribution:

use stratarouter_runtime::limits::metrics::cost_tracker::CostTracker;

let tracker = CostTracker::new();

// Record costs by provider
tracker.record_cost(CostRecord {
    tenant_id: "tenant_123".to_string(),
    provider: "openai".to_string(),
    model: "gpt-4".to_string(),
    cost_usd: 0.03,
    timestamp: Utc::now(),
}).await;

// Get breakdown
let breakdown = tracker.get_cost_breakdown("tenant_123", time_range).await?;
for (provider, cost) in breakdown.by_provider {
    println!("{}: ${:.2}", provider, cost);
}
// Output:
// openai: $45.67
// anthropic: $23.45
// google: $12.34

Latency Tracking

Track performance metrics:

use stratarouter_runtime::limits::metrics::latency_tracker::LatencyTracker;

let tracker = LatencyTracker::new();

// Record latency
tracker.record_latency(LatencyRecord {
    tenant_id: "tenant_123".to_string(),
    route_id: "gpt4".to_string(),
    latency_ms: 1250,
    timestamp: Utc::now(),
}).await;

// Get percentiles
let stats = tracker.get_latency_stats("tenant_123", time_range).await?;
println!("P50: {}ms", stats.p50);
println!("P95: {}ms", stats.p95);
println!("P99: {}ms", stats.p99);

Success Rate Tracking

Monitor reliability:

use stratarouter_runtime::limits::metrics::success_tracker::SuccessTracker;

let tracker = SuccessTracker::new();

// Record outcome
tracker.record_outcome(OutcomeRecord {
    tenant_id: "tenant_123".to_string(),
    route_id: "gpt4".to_string(),
    success: true,
    error_type: None,
    timestamp: Utc::now(),
}).await;

// Get success rate
let rate = tracker.get_success_rate("tenant_123", time_range).await?;
println!("Success rate: {:.1}%", rate * 100.0);

// Get error breakdown
let errors = tracker.get_error_breakdown("tenant_123", time_range).await?;
for (error_type, count) in errors {
    println!("{}: {} occurrences", error_type, count);
}

Quotas

Budget-based usage limits:

use stratarouter_runtime::limits::quotas::{QuotaManager, Quota, QuotaPeriod};

let manager = QuotaManager::new();

// Set monthly quota
manager.set_quota("tenant_123", Quota {
    max_requests: 10000,
    max_cost_usd: 100.0,
    period: QuotaPeriod::Monthly,
    enforcement: QuotaEnforcement::Hard,
}).await;

// Check quota before request
match manager.check_quota("tenant_123").await? {
    QuotaStatus::Available { remaining_requests, remaining_cost } => {
        // Execute request
        let result = execute().await?;

        // Deduct from quota
        manager.consume_quota("tenant_123", ConsumedQuota {
            requests: 1,
            cost_usd: result.cost,
        }).await;
    }
    QuotaStatus::Exceeded { exceeded_by } => {
        return Err("Monthly quota exceeded");
    }
}

Quota Types

pub enum QuotaPeriod {
    Hourly,
    Daily,
    Weekly,
    Monthly,
    Yearly,

    /// Custom period in seconds
    Custom(u64),
}

pub struct Quota {
    /// Maximum requests in period
    pub max_requests: usize,

    /// Maximum cost in period (USD)
    pub max_cost_usd: f64,

    /// Quota period
    pub period: QuotaPeriod,

    /// Hard limit (reject) or soft limit (warn)
    pub enforcement: QuotaEnforcement,
}

pub enum QuotaEnforcement {
    /// Hard limit - reject requests
    Hard,

    /// Soft limit - allow but warn
    Soft,

    /// Throttle - slow down requests
    Throttle { factor: f64 },
}

Quota Tracking

Monitor quota usage:

let usage = manager.get_quota_usage("tenant_123").await?;

println!("Requests: {}/{}", usage.used_requests, usage.max_requests);
println!("Cost: ${:.2}/${:.2}", usage.used_cost, usage.max_cost);
println!("Remaining: {:.1}%", usage.remaining_percent);

if usage.remaining_percent < 10.0 {
    warn!("Quota nearly exhausted for tenant_123");
}

Quota Alerts

Set up notifications:

manager.set_alert("tenant_123", QuotaAlert {
    threshold_percent: 80.0,
    notification_channel: "email:admin@example.com".to_string(),
}).await;

// Alert triggered when 80% of quota used

Complete Example

Production-ready configuration:

use stratarouter_runtime::limits::*;

#[tokio::main]
async fn main() -> Result<()> {
    // 1. Configure enforcer
    let config = LimitConfig {
        max_concurrent_executions: 100,
        max_requests_per_minute: 1000,
        max_cost_per_hour: 50.0,
        max_memory_mb: 4096,
        enable_quotas: true,
        enable_metrics: true,
    };

    let enforcer = Enforcer::new(config);

    // 2. Set up rate limits
    let mut governor = Governor::new_multi();
    governor.add_limit("free", RateLimitConfig {
        requests_per_second: 1.0,
        burst_size: 5,
        refill_strategy: RefillStrategy::Continuous,
    });
    governor.add_limit("pro", RateLimitConfig {
        requests_per_second: 10.0,
        burst_size: 20,
        refill_strategy: RefillStrategy::Continuous,
    });

    // 3. Set up quotas
    let quota_manager = QuotaManager::new();
    quota_manager.set_quota("tenant_123", Quota {
        max_requests: 100000,
        max_cost_usd: 500.0,
        period: QuotaPeriod::Monthly,
        enforcement: QuotaEnforcement::Hard,
    }).await;

    // 4. Set up metrics
    let metrics = MetricsCollector::new();

    // 5. Handle request
    async fn handle_request(
        tenant_id: &str,
        request: Request,
        enforcer: &Enforcer,
        governor: &Governor,
        quota_manager: &QuotaManager,
        metrics: &MetricsCollector,
    ) -> Result<Response> {
        // Check all limits
        enforcer.check_limits(tenant_id, &request).await?;

        let tier = get_user_tier(tenant_id);
        if !governor.check_rate_limit_for(tier, tenant_id).await? {
            return Err("Rate limit exceeded");
        }

        quota_manager.check_quota(tenant_id).await?;

        // Execute
        let start = Instant::now();
        let result = execute_request(request).await?;
        let latency_ms = start.elapsed().as_millis() as u64;

        // Record usage
        enforcer.record_usage(tenant_id, result.cost, latency_ms).await;
        quota_manager.consume_quota(tenant_id, ConsumedQuota {
            requests: 1,
            cost_usd: result.cost,
        }).await;

        metrics.record_execution(Usage {
            tenant_id: tenant_id.to_string(),
            route_id: result.route_id.clone(),
            cost_usd: result.cost,
            latency_ms,
            tokens_used: result.tokens,
            success: true,
            timestamp: Utc::now(),
        }).await;

        Ok(result.response)
    }

    Ok(())
}

Monitoring Dashboard

Query aggregated metrics:

// Get tenant overview
let overview = enforcer.get_tenant_overview("tenant_123").await?;

println!("=== Tenant Overview ===");
println!("Requests (24h): {}", overview.requests_24h);
println!("Cost (24h): ${:.2}", overview.cost_24h);
println!("Avg latency: {}ms", overview.avg_latency_ms);
println!("Success rate: {:.1}%", overview.success_rate * 100.0);
println!("Rate limit usage: {:.1}%", overview.rate_limit_usage_percent);
println!("Quota usage: {:.1}%", overview.quota_usage_percent);

See Also