Vision Models
Key Features
- Image Analysis: Understand and describe the content of images.
- Flexible Input Methods: Supports both image URLs and base64 encoded images.
- Multiple Image Inputs: Analyze multiple images in a single request.
Quick Start
Images can be provided to the model in two ways: by passing an image URL, or by passing the base64-encoded image directly in the request.
**Currently we support gpt-4o (vision), gpt-4-turbo (vision), Llama 3.2 11B Vision Instruct Turbo, and Llama 3.2 90B Vision Instruct Turbo.**
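If the endpoint follows the usual OpenAI-compatible layout, the model list may also be queryable at runtime. The /v1/models path below is an assumption based on that convention, not something this page confirms:

import requests

# Assumption: an OpenAI-compatible GET /v1/models endpoint exists.
response = requests.get(
    "https://api.umamiai.xyz/v1/models",
    headers={"Authorization": "Bearer YOUR_UMAMIAI_API_KEY"},
)
for model in response.json().get("data", []):
    print(model.get("id"))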
Example: What's in this image?
Python Example
import requests
import json

url = "https://api.umamiai.xyz/v1/chat/completions"

# A chat completion request whose content array holds one text part
# and one image_url part.
payload = json.dumps({
    "model": "gpt-4o",
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What's in this image?"},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
                    }
                }
            ]
        }
    ],
    "max_tokens": 300
})

headers = {
    "Content-Type": "application/json",
    "Authorization": "Bearer YOUR_UMAMIAI_API_KEY"
}

response = requests.post(url, headers=headers, data=payload)
print(response.json())
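The response body follows the standard chat-completions shape (the JavaScript example further down reads the same fields), so the assistant's reply can be extracted like this:

# Continuing from the request above: extract the assistant's reply
# from the standard chat-completions response shape.
data = response.json()
print(data["choices"][0]["message"]["content"])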
Uploading Base64 Encoded Images
For local images, you can pass the base64-encoded image to the model.
Python Example
import base64
import requests

# Encode a local image file as base64 so it can be sent inline.
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

# Path to your image
image_path = "path_to_your_image.jpg"
base64_image = encode_image(image_path)

url = "https://api.umamiai.xyz/v1/chat/completions"
headers = {
    "Content-Type": "application/json",
    "Authorization": "Bearer YOUR_UMAMIAI_API_KEY"
}

# The image is passed as a data URL: data:<mime-type>;base64,<data>.
payload = {
    "model": "gpt-4o",
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What's in this image?"},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
            ]
        }
    ],
    "max_tokens": 300
}

response = requests.post(url, headers=headers, json=payload)
print(response.json())
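The example above hard-codes image/jpeg in the data URL. If your images come in other formats, the MIME type can be guessed from the file extension with the standard-library mimetypes module; this is a small sketch on the assumption that the API accepts any common image MIME type in the data URL:

import base64
import mimetypes

def to_data_url(image_path):
    # Guess the MIME type from the file extension (e.g. image/png).
    mime_type, _ = mimetypes.guess_type(image_path)
    if mime_type is None:
        mime_type = "image/jpeg"  # assumption: fall back to JPEG when unknown
    with open(image_path, "rb") as image_file:
        encoded = base64.b64encode(image_file.read()).decode("utf-8")
    return f"data:{mime_type};base64,{encoded}"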
Multiple Image Inputs
The API can process multiple images in a single request.
Python Example
import requests
import json

url = "https://api.umamiai.xyz/v1/chat/completions"

# Two image_url parts in one content array; the model sees both images.
payload = json.dumps({
    "model": "gpt-4o",
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What are in these images? Is there any difference between them?"},
                {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"}},
                {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"}}
            ]
        }
    ],
    "max_tokens": 300
})

headers = {
    "Content-Type": "application/json",
    "Authorization": "Bearer YOUR_UMAMIAI_API_KEY"
}

response = requests.post(url, headers=headers, data=payload)
print(response.json())
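Because the content array is just a list of parts, the request can also be assembled programmatically when the number of images varies. A minimal sketch (the URLs here are placeholders):

image_urls = [
    "https://example.com/first.jpg",   # placeholder
    "https://example.com/second.jpg",  # placeholder
]

content = [{"type": "text", "text": "What are in these images?"}]
content += [{"type": "image_url", "image_url": {"url": u}} for u in image_urls]

payload = {
    "model": "gpt-4o",
    "messages": [{"role": "user", "content": content}],
    "max_tokens": 300,
}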
Vision & Chat Completion Example
JavaScript
const main = async () => {
  const result = await fetch('https://api.umamiai.xyz/v1/chat/completions', {
    method: 'POST',
    headers: {
      Authorization: 'Bearer <YOUR_UMAMIAI_API_KEY>',
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      model: 'meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo',
      max_tokens: 1024,
      messages: [
        {
          role: 'user',
          content: [
            {
              type: 'text',
              text: "What's in this image?",
            },
            {
              // Image parts carry only type and image_url;
              // role belongs on the message, not on a content part.
              type: 'image_url',
              image_url: {
                url: 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg',
              },
            },
          ],
        },
      ],
    }),
  }).then((res) => res.json());

  const message = result.choices[0].message.content;
  console.log(`Assistant: ${message}`);
};

main();
Python
from together import Together

# Point the Together client at the UmamiAI-compatible endpoint.
client = Together(base_url="https://api.umamiai.xyz/v1", api_key="<YOUR_UMAMIAI_API_KEY>")

def main():
    response = client.chat.completions.create(
        model="meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What sort of animal is in this picture? What is its usual diet? What area is the animal native to? And isn't there some AI model that's related to the image?",
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/LLama.jpg/444px-LLama.jpg?20050123205659",
                        },
                    },
                ],
            }
        ],
        max_tokens=1024,
    )
    print("Assistant:", response.choices[0].message.content)

if __name__ == "__main__":
    main()
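None of the examples above handle failures. Here is a minimal error-handling sketch for the raw-HTTP examples; the shape of the error body is an assumption based on OpenAI-style APIs:

import requests

url = "https://api.umamiai.xyz/v1/chat/completions"
headers = {
    "Content-Type": "application/json",
    "Authorization": "Bearer YOUR_UMAMIAI_API_KEY",
}
payload = {
    "model": "gpt-4o",
    "messages": [{"role": "user", "content": "Hello"}],
    "max_tokens": 16,
}

response = requests.post(url, headers=headers, json=payload, timeout=60)
if response.ok:
    print("Assistant:", response.json()["choices"][0]["message"]["content"])
else:
    # Assumption: errors come back as an OpenAI-style {"error": {...}} body.
    print("Request failed:", response.status_code, response.text)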