import base64
import io
import json
from typing import Any, Dict

from PIL import Image
from transformers import pipeline


class EndpointHandler:
    """
    Custom handler for a ZoeDepth depth-estimation model, implementing the
    Hugging Face Inference Endpoints custom-handler interface (a class named
    ``EndpointHandler`` exposing ``__init__`` and ``__call__``). The full
    result is serialized into a single JSON string.
    """

    def __init__(self, path: str = ""):
        # Initialize the depth-estimation pipeline from the model weights
        # stored at `path` (the repository root inside the endpoint).
        self.pipe = pipeline(task="depth-estimation", model=path)
        print("Depth estimation pipeline initialized successfully.")

    def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
        """
        Called for every API request.

        Args:
            data (Dict): The request payload. "inputs" may be a PIL Image,
                raw image bytes, or a base64-encoded image string.

        Returns:
            Dict[str, str]: A dictionary with a single key "generated_text",
            containing a JSON string of the results.
        """
        # Get the image from the request.
        inputs = data.pop("inputs", data)

        # Raw image uploads arrive as bytes (or a pre-decoded PIL Image),
        # while JSON payloads typically carry the image as a base64 string.
        if isinstance(inputs, Image.Image):
            image = inputs
        elif isinstance(inputs, str):
            image = Image.open(io.BytesIO(base64.b64decode(inputs)))
        else:
            image = Image.open(io.BytesIO(inputs))

        # Depth models expect 3-channel input; normalize RGBA/grayscale images.
        image = image.convert("RGB")

        # Pass the image to the pipeline.
        prediction = self.pipe(image)

        # Extract the raw depth values as nested lists (JSON-serializable).
        raw_depth_tensor = prediction["predicted_depth"]
        raw_depth_data = raw_depth_tensor.cpu().tolist()

        # Encode the rendered depth map (a PIL Image) as a base64 PNG data URI.
        visual_map_image = prediction["depth"]
        buffered = io.BytesIO()
        visual_map_image.save(buffered, format="PNG")
        visual_map_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")

        # Create a dictionary to hold all results.
        results = {
            "raw_depth_data": raw_depth_data,
            "visual_depth_map": f"data:image/png;base64,{visual_map_base64}",
        }

        # Serialize the entire results dictionary into a single JSON string
        # and return it in the required format.
        return {"generated_text": json.dumps(results)}
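

# Local smoke test -- a minimal sketch, not part of the handler contract.
# It assumes a ZoeDepth checkpoint such as "Intel/zoedepth-nyu-kitti" is
# reachable and that an image named "test.png" exists in the working
# directory; both names are illustrative placeholders.
if __name__ == "__main__":
    handler = EndpointHandler(path="Intel/zoedepth-nyu-kitti")

    # Simulate a request carrying raw image bytes under "inputs".
    with open("test.png", "rb") as f:
        payload = {"inputs": f.read()}

    response = handler(payload)

    # The handler returns one JSON string; decode it to inspect the results.
    results = json.loads(response["generated_text"])
    print("result keys:", list(results.keys()))
    print("visual map prefix:", results["visual_depth_map"][:40])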