jzhang533's picture
update settings
f162685
raw
history blame
9.16 kB
import os
import cv2
import time
import uuid
import base64
import re
import gradio as gr
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor, as_completed
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
# --- Configuration ---
# Qianfan (Baidu Cloud) OpenAI-compatible endpoint and credential; both come
# from the environment so no secret is committed with the code.
QIANFAN_URL = os.getenv("QIANFAN_URL", "")
QIANFAN_TOKEN = os.getenv("QIANFAN_TOKEN", "")
MODEL_NAME = "ernie-4.5-turbo-vl"  # multimodal ERNIE model used for every request
MAX_CONCURRENT_REQUESTS = 4  # thread-pool size for per-chunk analysis
MAX_VIDEO_DURATION_SEC = 1800  # uploads longer than this (30 min) are rejected
def extract_frames(video_path, interval_sec=1, target_height=512,
                   frames_per_chunk=30, jpeg_quality=60):
    """Sample frames from a video and group them into base64-JPEG chunks.

    One frame is taken roughly every ``interval_sec`` seconds, resized to
    ``target_height`` pixels tall (width scaled proportionally), JPEG-encoded
    at ``jpeg_quality``, and base64-encoded.

    Args:
        video_path: Path to a video file readable by OpenCV.
        interval_sec: Sampling interval in seconds.
        target_height: Height in pixels each sampled frame is resized to.
        frames_per_chunk: Number of frames per returned chunk.
        jpeg_quality: JPEG quality (0-100) used when encoding frames.

    Returns:
        A list of chunks; each chunk is a list of base64-encoded JPEG strings.
        Empty list if the video cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        return []
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        if fps == 0:
            fps = 30  # some containers report 0 fps; assume a common default
        # Guard against a zero step (fps * interval_sec < 1), which would
        # otherwise raise ZeroDivisionError in the modulo below.
        step = max(1, int(fps * interval_sec))
        chunk_frames, chunks, frame_count = [], [], 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count % step == 0:
                height, width = frame.shape[:2]
                scale = target_height / height
                resized_frame = cv2.resize(frame, (int(width * scale), target_height))
                ok, buffer = cv2.imencode(
                    '.jpg', resized_frame, [int(cv2.IMWRITE_JPEG_QUALITY), jpeg_quality])
                if ok:  # skip frames OpenCV fails to encode instead of sending garbage
                    chunk_frames.append(base64.b64encode(buffer).decode('utf-8'))
                if len(chunk_frames) == frames_per_chunk:
                    chunks.append(chunk_frames)
                    chunk_frames = []
            frame_count += 1
        if chunk_frames:  # flush the trailing partial chunk
            chunks.append(chunk_frames)
        return chunks
    finally:
        # Release the capture even if decoding raises mid-loop.
        cap.release()
def process_chunk_with_retry(client, chunk_index, frames_b64, max_retries=3):
    """Summarize one chunk of frames via the vision model, retrying on failure.

    Args:
        client: OpenAI-compatible client with ``chat.completions.create``.
        chunk_index: Position of this chunk; echoed back so callers can
            reassemble out-of-order results from the thread pool.
        frames_b64: Base64-encoded JPEG frames for this segment.
        max_retries: Number of attempts before giving up.

    Returns:
        ``(chunk_index, summary)``; ``summary`` is ``""`` if every attempt fails.
    """
    prompt = (
        "This is a segment from a frontend web development video tutorial (screenshots taken every second). "
        "Please focus intently on the code shown on the screen and the resulting web page style. "
        "Describe in detail the HTML structure, CSS styling rules, or JavaScript logic presented in this segment. "
        "Ignore unrelated video elements."
    )
    content = [{"type": "text", "text": prompt}]
    content.extend(
        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{f}"}}
        for f in frames_b64
    )
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=MODEL_NAME,
                messages=[{"role": "user", "content": content}],
                temperature=0.1, max_tokens=1024
            )
            return chunk_index, response.choices[0].message.content
        except Exception:
            # Back off before retrying — but not after the final attempt,
            # which previously wasted 2s before returning the failure result.
            if attempt < max_retries - 1:
                time.sleep(2)
    return chunk_index, ""
def aggregate_and_generate_webpage(client, summaries):
    """Merge per-segment summaries and ask the model for one complete HTML file.

    Args:
        client: OpenAI-compatible client with ``chat.completions.create``.
        summaries: Mapping of chunk index -> summary text (empty ones skipped).

    Returns:
        The model's response reduced, as far as possible, to a bare HTML
        document (markdown fences and surrounding prose stripped).
    """
    ordered = sorted(summaries.items())
    full_summary = "\n".join(
        f"Segment {i+1} Summary: {s}" for i, s in ordered if s
    )
    final_prompt = f"""
You are an expert Frontend Engineer. Based on the video segment summaries, write a complete HTML file.
**Summaries:**
{full_summary}
**Strict Output Instructions:**
1. Return ONLY the raw HTML code.
2. Start directly with `<!DOCTYPE html>`.
3. End directly with `</html>`.
4. NO introduction text, NO markdown backticks (```), NO explanations after the code.
"""
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": final_prompt}],
        temperature=0.2, top_p=0.8
    )
    raw = response.choices[0].message.content
    # Strip markdown fences, then isolate the HTML document itself.
    raw = raw.replace("```html", "").replace("```", "").strip()
    doc = re.search(r'(<!DOCTYPE html>.*</html>)', raw, re.DOTALL | re.IGNORECASE)
    if doc:
        return doc.group(1)
    # No closing </html> found: keep everything from the doctype onward.
    start = re.search(r'<!DOCTYPE html>', raw, re.IGNORECASE)
    if start:
        return raw[start.start():]
    return raw
def main_process(video_file, progress=gr.Progress()):
    """End-to-end pipeline: validate the upload, extract frames, summarize each
    chunk with the vision model, then synthesize and preview a single HTML file.

    Generator yielding ``(status_text, iframe_html, file_path, html_code)``
    tuples so the Gradio UI streams live progress into its four outputs.

    Raises:
        gr.Error: On missing API key, missing/too-long video, failed frame
            extraction, or when every segment analysis fails.
    """
    # 1. Initial state
    yield "⏳ Initializing...", None, None, None
    api_key = QIANFAN_TOKEN
    if not api_key:
        raise gr.Error("Server Config Error: API KEY missing.")
    if not video_file:
        raise gr.Error("Please upload a video.")
    # Reject over-long videos before doing any expensive work.
    cap = cv2.VideoCapture(video_file)
    fps = cap.get(cv2.CAP_PROP_FPS)
    count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    duration = count / fps if fps > 0 else 0
    cap.release()
    if duration > MAX_VIDEO_DURATION_SEC:
        # Derive the limit from the constant so the message can't drift
        # out of sync with MAX_VIDEO_DURATION_SEC (was hard-coded "30m").
        raise gr.Error(
            f"Video too long ({duration/60:.1f}m). "
            f"Limit is {MAX_VIDEO_DURATION_SEC/60:.0f}m."
        )
    client = OpenAI(api_key=api_key, base_url=QIANFAN_URL)
    # 2. Frame extraction
    yield "🎞️ Step 1/3: Extracting video frames...", None, None, None
    progress(0.1, desc="Extracting frames...")
    chunks = extract_frames(video_file)
    if not chunks:
        raise gr.Error("Frame extraction failed.")
    # 3. Analysis phase (bounded concurrency across chunks)
    yield f"🧠 Step 2/3: ERNIE is analyzing {len(chunks)} segments...", None, None, None
    progress(0.3, desc="Analyzing content...")
    chunk_summaries = {}
    total = len(chunks)
    with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_REQUESTS) as executor:
        future_to_chunk = {
            executor.submit(process_chunk_with_retry, client, i, chunk): i
            for i, chunk in enumerate(chunks)
        }
        completed = 0
        for future in as_completed(future_to_chunk):
            idx, summary = future.result()
            if summary:
                chunk_summaries[idx] = summary
            completed += 1
            # Advance both the status text and the progress bar per chunk
            # (the bar previously sat frozen at 0.3 for the whole phase).
            progress(0.3 + 0.5 * (completed / total), desc="Analyzing content...")
            yield f"🧠 Step 2/3: Analyzed segment {completed}/{total}...", None, None, None
    if not chunk_summaries:
        # Every chunk failed: synthesizing from an empty summary would only
        # produce a hallucinated page, so fail loudly instead.
        raise gr.Error("Analysis failed for every segment; cannot generate code.")
    # 4. Code synthesis phase
    yield "✍️ Step 3/3: Synthesizing final HTML code...", None, None, None
    progress(0.85, desc="Synthesizing code...")
    html_code = aggregate_and_generate_webpage(client, chunk_summaries)
    output_path = f"generated_website_{uuid.uuid4().hex}.html"
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(html_code)
    # Embed via a base64 data URI so the preview renders inside the iframe
    # without serving the generated file.
    b64_html = base64.b64encode(html_code.encode('utf-8')).decode('utf-8')
    data_uri = f"data:text/html;charset=utf-8;base64,{b64_html}"
    iframe_html = f"""<iframe src="{data_uri}" width="100%" height="600px" style="border: 1px solid #ccc; border-radius: 8px; background-color: white;"></iframe>"""
    progress(1.0, desc="Done")
    # 5. Done — return all results
    yield "✅ Generation Complete!", iframe_html, output_path, html_code
# --- UI ---
# Declarative Gradio layout: video input + status on the left, tabbed
# preview/source/download of the generated HTML on the right.
with gr.Blocks(title="Ernie 4.5 Video2Code", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# ⚡ ERNIE 4.5-VL: Video to Code Agent")
    # Fix 2: open=True so the capabilities accordion is expanded by default.
    with gr.Accordion("📚 Technical Capabilities of ERNIE 4.5-VL", open=True):
        gr.Markdown("""
This application is powered by **Baidu ERNIE 4.5**, a state-of-the-art foundation model with specific enhancements for video understanding:
* **👁️ Multimodal Heterogeneous MoE**: Uses dedicated vision experts to process images and video frames without interfering with text generation capabilities.
* **⏳ 3D-RoPE Temporal Modeling**: Incorporates 3D Rotary Position Embeddings to independently encode temporal, width, and height information.
* **📐 Adaptive Resolution**: Dynamically adjusts to different video aspect ratios to capture fine-grained code details.
* **🚀 Long Context Window**: Supports up to 128k context length for analyzing long tutorials.
""")
    gr.Markdown("Upload a frontend coding tutorial video. The AI will watch it, understand the code, and render the result instantly.")
    with gr.Row():
        with gr.Column(scale=1):
            # Fix 1: no fixed height parameter, so the player sizes itself
            # instead of rendering squashed.
            video_input = gr.Video(label="Upload Video", format="mp4")
            gr.Examples(
                examples=[["sample_demo.mp4"]],
                inputs=[video_input],
                label="▶️ Or try this example video:",
                cache_examples=False
            )
            submit_btn = gr.Button("🚀 Generate & Render", variant="primary", size="lg")
            # Fix 3: inline status textbox next to the button, so progress is
            # visible without scrolling to find the progress bar.
            status_output = gr.Textbox(label="Agent Status", value="Ready to start...", interactive=False)
        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.TabItem("🌐 Live Preview (Result)"):
                    html_preview = gr.HTML(label="Rendered Page")
                with gr.TabItem("📝 Source Code"):
                    code_output = gr.Code(language="html", label="HTML Source")
                with gr.TabItem("⬇️ Download"):
                    file_download = gr.File(label="Download .html File")
    # Wire the button: main_process is a generator, so each yield streams into
    # status_output plus the three result widgets.
    submit_btn.click(
        fn=main_process,
        inputs=[video_input],
        outputs=[status_output, html_preview, file_download, code_output]
    )
if __name__ == "__main__":
    demo.launch()