Merge pull request 'feat: scaffold Model Gateway Rust project (#38)' (#136) from feature/issue-38-scaffold-model-gateway into main

This commit was merged in pull request #136.
2026-03-10 13:43:22 +01:00
9 changed files with 570 additions and 0 deletions

Cargo.lock (generated)

@@ -1450,6 +1450,25 @@ dependencies = [
"windows-sys 0.61.2",
]
[[package]]
name = "model-gateway"
version = "0.1.0"
dependencies = [
"anyhow",
"llm-multiverse-proto",
"prost",
"prost-types",
"serde",
"tempfile",
"thiserror",
"tokio",
"tokio-stream",
"toml",
"tonic",
"tracing",
"tracing-subscriber",
]
[[package]]
name = "multimap"
version = "0.10.1"

Cargo.toml

@@ -5,4 +5,5 @@ members = [
"services/audit",
"services/secrets",
"services/memory",
"services/model-gateway",
]


@@ -41,6 +41,7 @@
| #35 | Implement QueryMemory gRPC endpoint (server-streaming) | Phase 4 | `COMPLETED` | Rust | [issue-035.md](issue-035.md) |
| #36 | Implement GetCorrelated gRPC endpoint | Phase 4 | `COMPLETED` | Rust | [issue-036.md](issue-036.md) |
| #37 | Integration tests for Memory Service | Phase 4 | `COMPLETED` | Rust | [issue-037.md](issue-037.md) |
| #38 | Scaffold Model Gateway Rust project | Phase 5 | `IMPLEMENTING` | Rust | [issue-038.md](issue-038.md) |
## Status Legend

issue-038.md

@@ -0,0 +1,48 @@
# Implementation Plan — Issue #38: Scaffold Model Gateway Rust project
## Metadata
| Field | Value |
|---|---|
| Issue | [#38](https://git.shahondin1624.de/llm-multiverse/llm-multiverse/issues/38) |
| Title | Scaffold Model Gateway Rust project |
| Milestone | Phase 5: Model Gateway |
| Labels | |
| Status | `IMPLEMENTING` |
| Language | Rust |
| Related Plans | [issue-018.md](issue-018.md), [issue-022.md](issue-022.md), [issue-027.md](issue-027.md) |
| Blocked by | #16 (completed) |
## Acceptance Criteria
- [ ] Cargo workspace member created (`services/model-gateway/`)
- [ ] Dependency on proto-gen crate
- [ ] Tonic gRPC server boilerplate compiles
- [ ] Configuration loading (address, port, Ollama URL, model routing config)
- [ ] Health check endpoint responds
## Implementation Steps
1. Add workspace member to root Cargo.toml
2. Create `services/model-gateway/Cargo.toml` with proto-gen dependency
3. Create `src/lib.rs` with module exports
4. Create `src/config.rs` with Config + ModelRoutingConfig (TOML loading, serde defaults)
5. Create `src/service.rs` with ModelGatewayService trait impl (IsModelReady + stubs); a client sketch for calling it follows this list
6. Create `src/main.rs` with tonic server entry point
7. Unit tests for config
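A quick way to smoke-test the scaffold once the server from step 6 is running is to call the one implemented RPC, `IsModelReady`, from a throwaway client. The sketch below is illustrative only: it assumes the proto crate also generates a `model_gateway_service_client` module mirroring the `model_gateway_service_server` module that `main.rs` imports.

```rust
use llm_multiverse_proto::llm_multiverse::v1::model_gateway_service_client::ModelGatewayServiceClient;
use llm_multiverse_proto::llm_multiverse::v1::IsModelReadyRequest;

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // Default listen address from config.rs: host "[::1]", port 50055.
    let mut client = ModelGatewayServiceClient::connect("http://[::1]:50055").await?;
    // model_name: None asks the gateway to report all configured models.
    let resp = client
        .is_model_ready(IsModelReadyRequest { model_name: None })
        .await?
        .into_inner();
    println!("ready={} models={:?}", resp.ready, resp.available_models);
    Ok(())
}
```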
## Files to Create/Modify
| File | Action | Purpose |
|---|---|---|
| `Cargo.toml` | Modify | Add workspace member |
| `services/model-gateway/Cargo.toml` | Create | Crate manifest |
| `services/model-gateway/src/lib.rs` | Create | Module exports |
| `services/model-gateway/src/main.rs` | Create | Server entry point |
| `services/model-gateway/src/config.rs` | Create | Configuration with TOML loading |
| `services/model-gateway/src/service.rs` | Create | gRPC service impl |
## Deviation Log
| Deviation | Reason |
|---|---|

services/model-gateway/Cargo.toml

@@ -0,0 +1,23 @@
[package]
name = "model-gateway"
version = "0.1.0"
edition = "2021"
description = "Model Gateway service for llm-multiverse — wraps Ollama HTTP API via gRPC"
publish = false
[dependencies]
llm-multiverse-proto = { path = "../../gen/rust" }
tonic = "0.13"
prost = "0.13"
prost-types = "0.13"
tokio = { version = "1", features = ["full"] }
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
serde = { version = "1", features = ["derive"] }
toml = "0.8"
anyhow = "1"
thiserror = "2"
tokio-stream = "0.1"
[dev-dependencies]
tempfile = "3"

services/model-gateway/src/config.rs

@@ -0,0 +1,250 @@
use serde::Deserialize;
use std::collections::HashMap;
/// Model routing configuration — maps task complexity to specific model names.
#[derive(Debug, Clone, Deserialize)]
pub struct ModelRoutingConfig {
/// Default model for unspecified complexity (default: "llama3.2:3b").
#[serde(default = "default_default_model")]
pub default_model: String,
/// Model for simple tasks (default: "llama3.2:3b").
#[serde(default = "default_simple_model")]
pub simple_model: String,
/// Model for complex tasks (default: "llama3.2:14b").
#[serde(default = "default_complex_model")]
pub complex_model: String,
/// Model for embedding generation (default: "nomic-embed-text").
#[serde(default = "default_embedding_model")]
pub embedding_model: String,
/// Additional model aliases (e.g., "code" -> "codellama:7b").
#[serde(default)]
pub aliases: HashMap<String, String>,
}
fn default_default_model() -> String {
"llama3.2:3b".to_string()
}
fn default_simple_model() -> String {
"llama3.2:3b".to_string()
}
fn default_complex_model() -> String {
"llama3.2:14b".to_string()
}
fn default_embedding_model() -> String {
"nomic-embed-text".to_string()
}
impl Default for ModelRoutingConfig {
fn default() -> Self {
Self {
default_model: default_default_model(),
simple_model: default_simple_model(),
complex_model: default_complex_model(),
embedding_model: default_embedding_model(),
aliases: HashMap::new(),
}
}
}
impl ModelRoutingConfig {
/// Return a list of all unique configured model names.
pub fn all_models(&self) -> Vec<String> {
let mut models = vec![
self.default_model.clone(),
self.simple_model.clone(),
self.complex_model.clone(),
self.embedding_model.clone(),
];
for v in self.aliases.values() {
models.push(v.clone());
}
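// Sorting first lets dedup collapse duplicates (default_model and simple_model
// share "llama3.2:3b" out of the box).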
models.sort();
models.dedup();
models
}
}
/// Top-level configuration for the Model Gateway service.
#[derive(Debug, Clone, Deserialize)]
pub struct Config {
/// Listen host (default: "[::1]").
#[serde(default = "default_host")]
pub host: String,
/// Listen port (default: 50055).
#[serde(default = "default_port")]
pub port: u16,
/// Ollama HTTP API base URL (default: "http://localhost:11434").
#[serde(default = "default_ollama_url")]
pub ollama_url: String,
/// Audit service gRPC address for logging.
pub audit_addr: Option<String>,
/// Model routing configuration.
#[serde(default)]
pub routing: ModelRoutingConfig,
}
fn default_host() -> String {
"[::1]".to_string()
}
fn default_port() -> u16 {
50055
}
fn default_ollama_url() -> String {
"http://localhost:11434".to_string()
}
impl Default for Config {
fn default() -> Self {
Self {
host: default_host(),
port: default_port(),
ollama_url: default_ollama_url(),
audit_addr: None,
routing: ModelRoutingConfig::default(),
}
}
}
impl Config {
/// Load config from file, falling back to defaults.
pub fn load(path: Option<&str>) -> anyhow::Result<Self> {
match path {
Some(p) => {
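// An explicit path must exist and parse cleanly; I/O and TOML errors propagate.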
let contents = std::fs::read_to_string(p)?;
let config: Config = toml::from_str(&contents)?;
Ok(config)
}
None => Ok(Self::default()),
}
}
pub fn listen_addr(&self) -> String {
format!("{}:{}", self.host, self.port)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_default_config() {
let config = Config::default();
assert_eq!(config.host, "[::1]");
assert_eq!(config.port, 50055);
assert_eq!(config.ollama_url, "http://localhost:11434");
assert!(config.audit_addr.is_none());
assert_eq!(config.routing.default_model, "llama3.2:3b");
assert_eq!(config.routing.embedding_model, "nomic-embed-text");
}
#[test]
fn test_listen_addr() {
let config = Config::default();
assert_eq!(config.listen_addr(), "[::1]:50055");
}
#[test]
fn test_load_from_toml() {
let dir = tempfile::tempdir().unwrap();
let config_path = dir.path().join("gateway.toml");
std::fs::write(
&config_path,
r#"
host = "0.0.0.0"
port = 9999
ollama_url = "http://gpu-server:11434"
audit_addr = "http://[::1]:50052"
[routing]
default_model = "mistral:7b"
simple_model = "phi3:mini"
complex_model = "llama3.2:70b"
embedding_model = "mxbai-embed-large"
[routing.aliases]
code = "codellama:7b"
"#,
)
.unwrap();
let config = Config::load(Some(config_path.to_str().unwrap())).unwrap();
assert_eq!(config.host, "0.0.0.0");
assert_eq!(config.port, 9999);
assert_eq!(config.ollama_url, "http://gpu-server:11434");
assert_eq!(config.audit_addr.as_deref(), Some("http://[::1]:50052"));
assert_eq!(config.routing.default_model, "mistral:7b");
assert_eq!(config.routing.simple_model, "phi3:mini");
assert_eq!(config.routing.complex_model, "llama3.2:70b");
assert_eq!(config.routing.embedding_model, "mxbai-embed-large");
assert_eq!(config.routing.aliases.get("code").unwrap(), "codellama:7b");
}
#[test]
fn test_load_no_file_uses_defaults() {
let config = Config::load(None).unwrap();
assert_eq!(config.port, 50055);
}
#[test]
fn test_routing_defaults() {
let routing = ModelRoutingConfig::default();
assert_eq!(routing.default_model, "llama3.2:3b");
assert_eq!(routing.simple_model, "llama3.2:3b");
assert_eq!(routing.complex_model, "llama3.2:14b");
assert_eq!(routing.embedding_model, "nomic-embed-text");
assert!(routing.aliases.is_empty());
}
#[test]
fn test_all_models() {
let mut routing = ModelRoutingConfig::default();
routing.aliases.insert("code".into(), "codellama:7b".into());
let models = routing.all_models();
assert!(models.contains(&"llama3.2:3b".to_string()));
assert!(models.contains(&"llama3.2:14b".to_string()));
assert!(models.contains(&"nomic-embed-text".to_string()));
assert!(models.contains(&"codellama:7b".to_string()));
}
#[test]
fn test_all_models_deduplicates() {
let routing = ModelRoutingConfig::default();
// default_model and simple_model are both "llama3.2:3b"
let models = routing.all_models();
let count = models.iter().filter(|m| *m == "llama3.2:3b").count();
assert_eq!(count, 1);
}
#[test]
fn test_routing_from_toml_uses_defaults_when_omitted() {
let dir = tempfile::tempdir().unwrap();
let config_path = dir.path().join("gateway.toml");
std::fs::write(
&config_path,
r#"
host = "0.0.0.0"
port = 9999
"#,
)
.unwrap();
let config = Config::load(Some(config_path.to_str().unwrap())).unwrap();
assert_eq!(config.routing.default_model, "llama3.2:3b");
assert_eq!(config.routing.embedding_model, "nomic-embed-text");
}
}
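
Nothing in this commit consumes the routing table beyond `all_models`; dispatch is deferred to later Phase 5 issues. As a hedged sketch of how a resolver might use it, with `resolve_model` being a hypothetical helper that does not exist in the scaffold:

```rust
use model_gateway::config::ModelRoutingConfig;

// Hypothetical helper (not part of this commit): route a requested model name
// through the alias table, falling back to the configured default.
fn resolve_model(routing: &ModelRoutingConfig, requested: Option<&str>) -> String {
    match requested {
        // No explicit model requested: use the configured default.
        None => routing.default_model.clone(),
        // Alias hit (e.g. "code" -> "codellama:7b"); otherwise pass the name through.
        Some(name) => routing
            .aliases
            .get(name)
            .cloned()
            .unwrap_or_else(|| name.to_string()),
    }
}
```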

services/model-gateway/src/lib.rs

@@ -0,0 +1,2 @@
pub mod config;
pub mod service;

services/model-gateway/src/main.rs

@@ -0,0 +1,44 @@
use llm_multiverse_proto::llm_multiverse::v1::model_gateway_service_server::ModelGatewayServiceServer;
use model_gateway::config::Config;
use model_gateway::service::ModelGatewayServiceImpl;
use tonic::transport::Server;
use tracing_subscriber::EnvFilter;
#[tokio::main]
async fn main() -> anyhow::Result<()> {
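// Log filtering honours RUST_LOG via EnvFilter::from_default_env, with a
// "model_gateway=info" directive layered on top as the crate's baseline.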
tracing_subscriber::fmt()
.with_env_filter(
EnvFilter::from_default_env().add_directive("model_gateway=info".parse()?),
)
.init();
let config_path = std::env::var("MODEL_GATEWAY_CONFIG").ok();
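// MODEL_GATEWAY_CONFIG optionally points at a TOML config file; when unset,
// Config::load falls back to built-in defaults.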
let config = Config::load(config_path.as_deref())?;
tracing::info!(
addr = %config.listen_addr(),
ollama_url = %config.ollama_url,
default_model = %config.routing.default_model,
"Starting Model Gateway"
);
let addr = config.listen_addr().parse()?;
let service = ModelGatewayServiceImpl::new(config);
tracing::info!(%addr, "Model Gateway listening");
Server::builder()
.add_service(ModelGatewayServiceServer::new(service))
.serve_with_shutdown(addr, shutdown_signal())
.await?;
tracing::info!("Model Gateway shut down gracefully");
Ok(())
}
async fn shutdown_signal() {
tokio::signal::ctrl_c()
.await
.expect("failed to listen for ctrl+c");
tracing::info!("Shutdown signal received");
}

services/model-gateway/src/service.rs

@@ -0,0 +1,182 @@
use llm_multiverse_proto::llm_multiverse::v1::model_gateway_service_server::ModelGatewayService;
use llm_multiverse_proto::llm_multiverse::v1::{
GenerateEmbeddingRequest, GenerateEmbeddingResponse, InferenceRequest, InferenceResponse,
IsModelReadyRequest, IsModelReadyResponse, StreamInferenceRequest, StreamInferenceResponse,
};
use tonic::{Request, Response, Status};
use crate::config::Config;
/// Implementation of the ModelGatewayService gRPC trait.
pub struct ModelGatewayServiceImpl {
config: Config,
}
impl ModelGatewayServiceImpl {
pub fn new(config: Config) -> Self {
Self { config }
}
}
#[tonic::async_trait]
impl ModelGatewayService for ModelGatewayServiceImpl {
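// tonic requires the server-streaming associated type to be named even while
// the StreamInference RPC below is still a stub.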
type StreamInferenceStream =
tokio_stream::wrappers::ReceiverStream<Result<StreamInferenceResponse, Status>>;
async fn stream_inference(
&self,
_request: Request<StreamInferenceRequest>,
) -> Result<Response<Self::StreamInferenceStream>, Status> {
Err(Status::unimplemented(
"StreamInference not yet implemented",
))
}
async fn inference(
&self,
_request: Request<InferenceRequest>,
) -> Result<Response<InferenceResponse>, Status> {
Err(Status::unimplemented("Inference not yet implemented"))
}
async fn generate_embedding(
&self,
_request: Request<GenerateEmbeddingRequest>,
) -> Result<Response<GenerateEmbeddingResponse>, Status> {
Err(Status::unimplemented(
"GenerateEmbedding not yet implemented",
))
}
async fn is_model_ready(
&self,
request: Request<IsModelReadyRequest>,
) -> Result<Response<IsModelReadyResponse>, Status> {
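// Scaffold semantics: readiness reflects the static routing config only; no
// live query against Ollama happens yet.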
let req = request.into_inner();
let all_models = self.config.routing.all_models();
if let Some(model_name) = &req.model_name {
let available = all_models.contains(model_name);
Ok(Response::new(IsModelReadyResponse {
ready: available,
available_models: if available {
vec![model_name.clone()]
} else {
vec![]
},
error_message: if available {
None
} else {
Some(format!("Model '{}' is not configured", model_name))
},
}))
} else {
Ok(Response::new(IsModelReadyResponse {
ready: true,
available_models: all_models,
error_message: None,
}))
}
}
}
#[cfg(test)]
mod tests {
use super::*;
fn test_config() -> Config {
Config::default()
}
#[tokio::test]
async fn test_is_model_ready_all_models() {
let svc = ModelGatewayServiceImpl::new(test_config());
let req = Request::new(IsModelReadyRequest { model_name: None });
let resp = svc.is_model_ready(req).await.unwrap().into_inner();
assert!(resp.ready);
assert!(!resp.available_models.is_empty());
assert!(resp.available_models.contains(&"llama3.2:3b".to_string()));
assert!(resp
.available_models
.contains(&"nomic-embed-text".to_string()));
assert!(resp.error_message.is_none());
}
#[tokio::test]
async fn test_is_model_ready_specific_model_found() {
let svc = ModelGatewayServiceImpl::new(test_config());
let req = Request::new(IsModelReadyRequest {
model_name: Some("llama3.2:3b".to_string()),
});
let resp = svc.is_model_ready(req).await.unwrap().into_inner();
assert!(resp.ready);
assert_eq!(resp.available_models, vec!["llama3.2:3b"]);
assert!(resp.error_message.is_none());
}
#[tokio::test]
async fn test_is_model_ready_specific_model_not_found() {
let svc = ModelGatewayServiceImpl::new(test_config());
let req = Request::new(IsModelReadyRequest {
model_name: Some("nonexistent:latest".to_string()),
});
let resp = svc.is_model_ready(req).await.unwrap().into_inner();
assert!(!resp.ready);
assert!(resp.available_models.is_empty());
assert!(resp.error_message.is_some());
}
#[tokio::test]
async fn test_is_model_ready_with_aliases() {
let mut config = test_config();
config
.routing
.aliases
.insert("code".into(), "codellama:7b".into());
let svc = ModelGatewayServiceImpl::new(config);
let req = Request::new(IsModelReadyRequest {
model_name: Some("codellama:7b".to_string()),
});
let resp = svc.is_model_ready(req).await.unwrap().into_inner();
assert!(resp.ready);
}
#[tokio::test]
async fn test_stream_inference_unimplemented() {
let svc = ModelGatewayServiceImpl::new(test_config());
let req = Request::new(StreamInferenceRequest { params: None });
let result = svc.stream_inference(req).await;
assert!(result.is_err());
assert_eq!(result.unwrap_err().code(), tonic::Code::Unimplemented);
}
#[tokio::test]
async fn test_inference_unimplemented() {
let svc = ModelGatewayServiceImpl::new(test_config());
let req = Request::new(InferenceRequest { params: None });
let result = svc.inference(req).await;
assert!(result.is_err());
assert_eq!(result.unwrap_err().code(), tonic::Code::Unimplemented);
}
#[tokio::test]
async fn test_generate_embedding_unimplemented() {
let svc = ModelGatewayServiceImpl::new(test_config());
let req = Request::new(GenerateEmbeddingRequest {
context: None,
text: "test".into(),
model: None,
});
let result = svc.generate_embedding(req).await;
assert!(result.is_err());
assert_eq!(result.unwrap_err().code(), tonic::Code::Unimplemented);
}
}
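
The scaffold's unit tests run with `cargo test -p model-gateway` (package name per the crate manifest above); Inference, StreamInference, and GenerateEmbedding deliberately stay `unimplemented` stubs for later Phase 5 issues.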