| import streamlit as st |
| from together import Together |
| import base64 |
| from PIL import Image |
| from io import BytesIO |
| import os |
| from elevenlabs.client import ElevenLabs |
|
|
| |
# Copy the API keys from Streamlit's secrets store into the process
# environment (Together key first, then ElevenLabs), so the clients below
# can read them back from ``os.environ``.
for _env_var, _secret_name in (
    ("TOGETHER_API_KEY", "together_api"),
    ("ELEVENLABS_API_KEY", "elevenlabs_api"),
):
    os.environ[_env_var] = st.secrets[_secret_name]

# Module-level clients shared by the helper functions in this script.
together_client = Together(api_key=os.environ["TOGETHER_API_KEY"])
elevenlabs_client = ElevenLabs(api_key=os.environ["ELEVENLABS_API_KEY"])
|
|
| |
def encode_image(image):
    """Return *image* serialized as JPEG and encoded as a base64 string.

    Args:
        image: A PIL-style image object exposing ``mode``, ``convert`` and
            ``save(fp, format=...)``.

    Returns:
        The JPEG bytes of the image, base64-encoded and decoded to a
        UTF-8 ``str``.
    """
    # JPEG has no alpha channel or palette support; saving e.g. an RGBA or
    # P-mode image directly fails, so normalise those to RGB first. RGB and
    # greyscale ("L") images are saved untouched.
    if image.mode not in ("RGB", "L"):
        image = image.convert("RGB")

    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")
|
|
| |
def get_image_description(image):
    """Ask the Together chat-completions API to describe *image*.

    The image is base64-encoded with ``encode_image`` and sent inline as a
    JPEG data URL, alongside a fixed text prompt, in a single user message.

    Args:
        image: The image object to describe; passed to ``encode_image``.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    prompt_part = {
        "type": "text",
        "text": "Describe the given image in detail in only 20 words max.",
    }
    image_part = {
        "type": "image_url",
        "image_url": {
            "url": f"data:image/jpeg;base64,{encode_image(image)}",
        },
    }
    user_message = {"role": "user", "content": [prompt_part, image_part]}

    # Non-streaming request: the full completion comes back in one response.
    completion = together_client.chat.completions.create(
        model="meta-llama/Llama-Vision-Free",
        messages=[user_message],
        stream=False,
    )

    first_choice = completion.choices[0]
    return first_choice.message.content
|
|
| |
def tts(text):
    """Convert *text* to speech with ElevenLabs and autoplay it in the app.

    Any exception raised while generating or rendering the audio is shown
    to the user via ``st.error`` instead of propagating.

    Args:
        text: The text to speak. Empty or ``None`` text is skipped with a
            warning rather than sent to the API.

    Returns:
        None. The audio player is rendered as a side effect.
    """
    if not text:
        st.warning("No text available to convert to speech.")
        return

    try:
        audio_chunks = elevenlabs_client.text_to_speech.convert(
            text=text,
            voice_id="JBFqnCBsd6RMkjVDRZzb",
            model_id="eleven_multilingual_v2",
            output_format="mp3_44100_128",
        )

        # Buffer the audio in memory instead of a fixed "temp_audio.mp3"
        # file: a shared path in the working directory would be overwritten
        # by concurrent sessions and was never cleaned up.
        audio_bytes = b"".join(audio_chunks)

        st.audio(audio_bytes, format="audio/mp3", autoplay=True)
    except Exception as e:
        # UI boundary: report the failure in the page rather than crash.
        st.error(f"Error generating speech: {e}")
|
|
| |
# Custom stylesheet for the page: dark gradient background, gradient
# buttons, rounded images, and coloured headings/spinner.
_PAGE_STYLE = """
<style>
.stApp {
background: linear-gradient(135deg, #1e1e2f, #2a2a40);
color: #ffffff;
font-family: 'Arial', sans-serif;
}
.stButton>button {
background: linear-gradient(135deg, #6a11cb, #2575fc);
color: white;
border: none;
border-radius: 12px;
padding: 10px 20px;
font-size: 16px;
font-weight: bold;
}
.stButton>button:hover {
background: linear-gradient(135deg, #2575fc, #6a11cb);
}
.stImage {
border-radius: 12px;
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
}
.stMarkdown h1 {
color: #6a11cb;
text-align: center;
font-size: 36px;
font-weight: bold;
}
.stMarkdown h2 {
color: #2575fc;
font-size: 24px;
font-weight: bold;
}
.stSpinner>div {
color: #6a11cb;
}
</style>
"""

# ``unsafe_allow_html`` is required so the <style> tag is emitted as HTML.
st.markdown(_PAGE_STYLE, unsafe_allow_html=True)
|
|
| |
# Page heading. The leading emoji was mis-encoded in the original literal;
# it is restored here to the crystal-ball character.
st.title("🔮 Visox | Koshur AI")
st.markdown("### See the world through AI's eyes!")

# Sidebar: short "About" panel describing the app.
st.sidebar.markdown("## About")
st.sidebar.markdown("This app uses advanced AI to describe what it sees through your camera in real-time.")
st.sidebar.markdown("Powered by [Together AI](https://together.ai) and Streamlit.")
|
|
| |
# Camera widget; the rest of the flow only runs once a picture exists.
img_file_buffer = st.camera_input("Take a picture")

if img_file_buffer is not None:
    img = Image.open(img_file_buffer)

    st.image(img, caption="Captured Image", width=300)

    # Report a failed vision request in the page (the same way ``tts``
    # reports speech failures) instead of letting the script crash, and
    # skip the steps that depend on the description.
    try:
        with st.spinner("🔍 Analyzing the image..."):
            description = get_image_description(img)
    except Exception as e:
        st.error(f"Error analyzing image: {e}")
    else:
        # The emoji in these messages were mis-encoded in the original
        # literals, and the success string was broken across two lines.
        st.success("✅ Analysis complete!")
        st.markdown("### AI Description:")
        st.write(description)

        # Read the description aloud.
        tts(description)