Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -60,15 +60,15 @@ tracing-subscriber = { version = "0.3", features = [
tracing-futures = { version = "0.2", optional = true }
rstructor_derive = { version = "0.2.8", path = "./rstructor_derive", optional = true }
chrono = "0.4" # For date/time validation in examples
base64 = { version = "0.22", optional = true }
base64 = "0.22"

# Feature flags
[features]
default = ["openai", "anthropic", "grok", "gemini", "derive", "logging"]
openai = ["reqwest", "tokio"]
anthropic = ["reqwest", "tokio"]
grok = ["reqwest", "tokio"]
gemini = ["reqwest", "tokio", "base64"]
gemini = ["reqwest", "tokio"]
derive = ["rstructor_derive"]
logging = ["tracing-subscriber", "tracing-futures"]

Expand Down
23 changes: 14 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -201,29 +201,28 @@ struct Event {

## Multimodal (Image Input)

Analyze images with structured extraction using Gemini's inline data support:
Analyze images with structured extraction across all major providers using `materialize_with_media`:

```rust
use rstructor::{Instructor, LLMClient, GeminiClient, MediaFile};
use rstructor::{Instructor, LLMClient, OpenAIClient, MediaFile};

#[derive(Instructor, Serialize, Deserialize, Debug)]
struct ImageAnalysis {
subject: String,
colors: Vec<String>,
is_logo: bool,
description: String,
summary: String,
}

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
// Download or load image bytes
// Download or load image bytes (real-world fixture)
let image_bytes = reqwest::get("https://example.com/image.png")
.await?.bytes().await?;

// Create inline media from bytes (base64-encoded automatically)
// Inline media is base64-encoded automatically
let media = MediaFile::from_bytes(&image_bytes, "image/png");

let client = GeminiClient::from_env()?;
// Works with OpenAI, Anthropic, Grok, and Gemini clients
let client = OpenAIClient::from_env()?;
let analysis: ImageAnalysis = client
.materialize_with_media("Describe this image", &[media])
.await?;
Expand All @@ -232,7 +231,13 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
}
```

`MediaFile::new(uri, mime_type)` is also available for Gemini Files API / GCS URIs.
`MediaFile::new(uri, mime_type)` is also available for URL/URI-based media input.

Provider examples:
- `cargo run --example openai_multimodal_example --features openai`
- `cargo run --example anthropic_multimodal_example --features anthropic`
- `cargo run --example grok_multimodal_example --features grok`
- `cargo run --example gemini_multimodal_example --features gemini`

## Extended Thinking

Expand Down
38 changes: 38 additions & 0 deletions examples/anthropic_multimodal_example.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
//! Anthropic Multimodal Structured Extraction Example
//!
//! Demonstrates extracting a typed struct from an image with Claude.
//!
//! Run with:
//! ```bash
//! export ANTHROPIC_API_KEY=your_key_here
//! cargo run --example anthropic_multimodal_example --features anthropic
//! ```

use rstructor::{AnthropicClient, AnthropicModel, Instructor, LLMClient, MediaFile};
use serde::{Deserialize, Serialize};
use std::env;

/// Structured description of an analyzed image.
#[derive(Instructor, Serialize, Deserialize, Debug)]
struct ImageAnalysis {
    subject: String,
    summary: String,
    colors: Vec<String>,
}

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Fail fast with a clear message when the API key is absent.
    env::var("ANTHROPIC_API_KEY").expect("Please set ANTHROPIC_API_KEY environment variable");

    // Fetch a real-world image fixture to analyze.
    let url = "https://www.rust-lang.org/logos/rust-logo-512x512.png";
    let bytes = reqwest::get(url).await?.bytes().await?;

    // Inline media payloads are base64-encoded automatically.
    let image = MediaFile::from_bytes(&bytes, "image/png");

    let client = AnthropicClient::from_env()?
        .model(AnthropicModel::ClaudeOpus46)
        .temperature(0.0);

    let result: ImageAnalysis = client
        .materialize_with_media("Describe this image and list dominant colors.", &[image])
        .await?;

    println!("{:#?}", result);
    Ok(())
}
38 changes: 38 additions & 0 deletions examples/grok_multimodal_example.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
//! Grok Multimodal Structured Extraction Example
//!
//! Demonstrates extracting a typed struct from an image with Grok.
//!
//! Run with:
//! ```bash
//! export XAI_API_KEY=your_key_here
//! cargo run --example grok_multimodal_example --features grok
//! ```

use rstructor::{GrokClient, GrokModel, Instructor, LLMClient, MediaFile};
use serde::{Deserialize, Serialize};
use std::env;

/// Structured description of an analyzed image.
#[derive(Instructor, Serialize, Deserialize, Debug)]
struct ImageAnalysis {
    subject: String,
    summary: String,
    colors: Vec<String>,
}

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Fail fast with a clear message when the API key is absent.
    env::var("XAI_API_KEY").expect("Please set XAI_API_KEY environment variable");

    // Fetch a real-world image fixture to analyze.
    let url = "https://www.rust-lang.org/logos/rust-logo-512x512.png";
    let bytes = reqwest::get(url).await?.bytes().await?;

    // Inline media payloads are base64-encoded automatically.
    let image = MediaFile::from_bytes(&bytes, "image/png");

    let client = GrokClient::from_env()?
        .model(GrokModel::Grok41FastNonReasoning)
        .temperature(0.0);

    let result: ImageAnalysis = client
        .materialize_with_media("Describe this image and list dominant colors.", &[image])
        .await?;

    println!("{:#?}", result);
    Ok(())
}
38 changes: 38 additions & 0 deletions examples/openai_multimodal_example.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
//! OpenAI Multimodal Structured Extraction Example
//!
//! Demonstrates extracting a typed struct from an image with GPT models.
//!
//! Run with:
//! ```bash
//! export OPENAI_API_KEY=your_key_here
//! cargo run --example openai_multimodal_example --features openai
//! ```

use rstructor::{Instructor, LLMClient, MediaFile, OpenAIClient, OpenAIModel};
use serde::{Deserialize, Serialize};
use std::env;

/// Structured description of an analyzed image.
#[derive(Instructor, Serialize, Deserialize, Debug)]
struct ImageAnalysis {
    subject: String,
    summary: String,
    colors: Vec<String>,
}

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Fail fast with a clear message when the API key is absent.
    env::var("OPENAI_API_KEY").expect("Please set OPENAI_API_KEY environment variable");

    // Fetch a real-world image fixture to analyze.
    let url = "https://www.rust-lang.org/logos/rust-logo-512x512.png";
    let bytes = reqwest::get(url).await?.bytes().await?;

    // Inline media payloads are base64-encoded automatically.
    let image = MediaFile::from_bytes(&bytes, "image/png");

    let client = OpenAIClient::from_env()?
        .model(OpenAIModel::Gpt52)
        .temperature(0.0);

    let result: ImageAnalysis = client
        .materialize_with_media("Describe this image and list dominant colors.", &[image])
        .await?;

    println!("{:#?}", result);
    Ok(())
}
48 changes: 39 additions & 9 deletions src/backend/anthropic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@ use std::time::Duration;
use tracing::{debug, error, info, instrument, trace, warn};

use crate::backend::{
ChatMessage, GenerateResult, LLMClient, MaterializeInternalOutput, MaterializeResult,
ModelInfo, ThinkingLevel, TokenUsage, ValidationFailureContext, check_response_status,
generate_with_retry_with_history, handle_http_error, parse_validate_and_create_output,
AnthropicMessageContent, ChatMessage, GenerateResult, LLMClient, MaterializeInternalOutput,
MaterializeResult, ModelInfo, ThinkingLevel, TokenUsage, ValidationFailureContext,
build_anthropic_message_content, check_response_status, generate_with_retry_with_history,
handle_http_error, materialize_with_media_with_retry, parse_validate_and_create_output,
prepare_strict_schema,
};
use crate::error::{ApiErrorKind, RStructorError, Result};
Expand Down Expand Up @@ -153,7 +154,7 @@ pub struct AnthropicClient {
#[derive(Debug, Serialize)]
struct AnthropicMessage {
role: String,
content: String,
content: AnthropicMessageContent,
}

/// Output format for structured outputs (native Anthropic structured outputs)
Expand Down Expand Up @@ -351,11 +352,14 @@ impl AnthropicClient {
// With native structured outputs, we don't need to include schema instructions in the prompt
let api_messages: Vec<AnthropicMessage> = messages
.iter()
.map(|msg| AnthropicMessage {
role: msg.role.as_str().to_string(),
content: msg.content.clone(),
.map(|msg| {
Ok(AnthropicMessage {
role: msg.role.as_str().to_string(),
content: build_anthropic_message_content(msg)?,
})
})
.collect();
.collect::<Result<Vec<_>>>()
.map_err(|e| (e, None))?;

// Build thinking config for Claude 4.x models
let is_thinking_model = self.config.model.as_str().contains("sonnet-4")
Expand Down Expand Up @@ -574,6 +578,32 @@ impl LLMClient for AnthropicClient {
Ok(output.data)
}

/// Materializes a typed `T` from a text prompt plus attached media files.
///
/// Delegates to the shared `materialize_with_media_with_retry` helper,
/// passing `self.config.max_retries` as the retry budget — presumably the
/// helper builds media-bearing chat messages and re-invokes
/// `materialize_internal` on validation failure; confirm against the helper.
#[instrument(
name = "anthropic_materialize_with_media",
skip(self, prompt, media),
fields(
type_name = std::any::type_name::<T>(),
model = %self.config.model.as_str(),
prompt_len = prompt.len(),
media_len = media.len()
)
)]
async fn materialize_with_media<T>(&self, prompt: &str, media: &[super::MediaFile]) -> Result<T>
where
T: Instructor + DeserializeOwned + Send + 'static,
{
materialize_with_media_with_retry(
// Rebind `self` so the `async move` block captures the reference
// by value instead of attempting to move `self` out of the closure.
|messages: Vec<ChatMessage>| {
let this = self;
async move { this.materialize_internal::<T>(&messages).await }
},
prompt,
media,
self.config.max_retries,
)
.await
}

#[instrument(
name = "anthropic_materialize_with_metadata",
skip(self, prompt),
Expand Down Expand Up @@ -650,7 +680,7 @@ impl LLMClient for AnthropicClient {
model: self.config.model.as_str().to_string(),
messages: vec![AnthropicMessage {
role: "user".to_string(),
content: prompt.to_string(),
content: AnthropicMessageContent::Text(prompt.to_string()),
}],
temperature: effective_temp,
max_tokens: effective_max_tokens(self.config.max_tokens, thinking_config.as_ref()),
Expand Down
29 changes: 20 additions & 9 deletions src/backend/gemini.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ use tracing::{debug, error, info, instrument, trace, warn};
use crate::backend::{
ChatMessage, GenerateResult, LLMClient, MaterializeInternalOutput, MaterializeResult,
ModelInfo, ThinkingLevel, TokenUsage, ValidationFailureContext, check_response_status,
generate_with_retry_with_history, handle_http_error, parse_validate_and_create_output,
generate_with_retry_with_history, handle_http_error, materialize_with_media_with_retry,
parse_validate_and_create_output,
};
use crate::error::{ApiErrorKind, RStructorError, Result};
use crate::model::Instructor;
Expand Down Expand Up @@ -50,12 +51,16 @@ pub enum Model {
Gemini25Flash,
/// Gemini 2.5 Flash Lite (smaller, faster variant)
Gemini25FlashLite,
/// Gemini 2.5 Flash Image (image generation/analysis tuned variant)
Gemini25FlashImage,
/// Gemini 2.0 Flash (stable 2.0 Flash model)
Gemini20Flash,
/// Gemini 2.0 Flash 001 (specific version of 2.0 Flash)
Gemini20Flash001,
/// Gemini 2.0 Flash Lite (smaller 2.0 Flash variant)
Gemini20FlashLite,
/// Gemini 2.0 Flash Lite 001 (specific version of 2.0 Flash Lite)
Gemini20FlashLite001,
/// Gemini Pro Latest (alias for latest Pro model)
GeminiProLatest,
/// Gemini Flash Latest (alias for latest Flash model)
Expand All @@ -74,9 +79,11 @@ impl Model {
Model::Gemini25Pro => "gemini-2.5-pro",
Model::Gemini25Flash => "gemini-2.5-flash",
Model::Gemini25FlashLite => "gemini-2.5-flash-lite",
Model::Gemini25FlashImage => "gemini-2.5-flash-image",
Model::Gemini20Flash => "gemini-2.0-flash",
Model::Gemini20Flash001 => "gemini-2.0-flash-001",
Model::Gemini20FlashLite => "gemini-2.0-flash-lite",
Model::Gemini20FlashLite001 => "gemini-2.0-flash-lite-001",
Model::GeminiProLatest => "gemini-pro-latest",
Model::GeminiFlashLatest => "gemini-flash-latest",
Model::GeminiFlashLiteLatest => "gemini-flash-lite-latest",
Expand All @@ -96,9 +103,11 @@ impl Model {
"gemini-2.5-pro" => Model::Gemini25Pro,
"gemini-2.5-flash" => Model::Gemini25Flash,
"gemini-2.5-flash-lite" => Model::Gemini25FlashLite,
"gemini-2.5-flash-image" => Model::Gemini25FlashImage,
"gemini-2.0-flash" => Model::Gemini20Flash,
"gemini-2.0-flash-001" => Model::Gemini20Flash001,
"gemini-2.0-flash-lite" => Model::Gemini20FlashLite,
"gemini-2.0-flash-lite-001" => Model::Gemini20FlashLite001,
"gemini-pro-latest" => Model::GeminiProLatest,
"gemini-flash-latest" => Model::GeminiFlashLatest,
"gemini-flash-lite-latest" => Model::GeminiFlashLiteLatest,
Expand Down Expand Up @@ -652,14 +661,16 @@ impl LLMClient for GeminiClient {
where
T: Instructor + DeserializeOwned + Send + 'static,
{
// For media support, we need to create a ChatMessage with media and pass it directly
// We can't use generate_with_retry_with_history since it only takes a string prompt
let initial_message = ChatMessage::user_with_media(prompt, media.to_vec());
let output = self
.materialize_internal::<T>(&[initial_message])
.await
.map_err(|(err, _)| err)?;
Ok(output.data)
materialize_with_media_with_retry(
|messages: Vec<ChatMessage>| {
let this = self;
async move { this.materialize_internal::<T>(&messages).await }
},
prompt,
media,
self.config.max_retries,
)
.await
}

#[instrument(
Expand Down
Loading