# CineGen-CPU: cinegen/video_engine.py
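"""Video generation engine for CineGen.

Renders each storyboard scene to a clip through the Hugging Face Inference API
(with a local placeholder fallback) and concatenates the clips into one short film.

Typical usage (sketch; assumes a populated ``Storyboard``):

    director = VideoDirector()
    final_path, logs = director.render(storyboard)
"""
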
from __future__ import annotations
import os
import tempfile
from typing import Dict, List, Optional, Sequence, Tuple
from huggingface_hub import InferenceClient
from .models import SceneBeat, Storyboard
from .placeholders import create_placeholder_video
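
# Hosted text-to-video models, tried in the order listed until one succeeds.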
DEFAULT_VIDEO_MODELS = [
"Wan-AI/Wan2.2-TI2V-5B",
"Lightricks/LTX-Video-0.9.7-distilled",
"tencent/HunyuanVideo-1.5",
"THUDM/CogVideoX-5b",
]
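
# Per-model inference-provider overrides passed through to InferenceClient.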
MODEL_PROVIDER_OVERRIDES: Dict[str, Optional[str]] = {
"Wan-AI/Wan2.2-TI2V-5B": "fal-ai",
}
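
# Bounds and rate used to convert a scene's duration (seconds) into a frame count.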
MIN_FRAMES = 16
MAX_FRAMES = 240
FRAMES_PER_SECOND = 8


class VideoDirector:
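    """Turns a Storyboard into per-scene clips and a merged final cut."""
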
def __init__(
self,
token: Optional[str] = None,
models: Optional[Sequence[str]] = None,
):
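        """Store the resolved API token and the ordered list of candidate models."""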
env_token = (
token
or os.environ.get("HF_TOKEN")
or os.environ.get("HUGGINGFACEHUB_API_TOKEN")
or os.environ.get("HUGGING_FACE_HUB_TOKEN")
)
self.token = env_token
self.models = list(models or DEFAULT_VIDEO_MODELS)

    def render(self, storyboard: Storyboard) -> Tuple[str, List[str]]:
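        """Render a clip for every scene, then return (final_video_path, log_lines)."""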
logs: List[str] = []
clip_paths: List[str] = []
for scene in storyboard.scenes:
video = self._produce_scene(storyboard, scene, logs)
clip_paths.append(video)
final_cut = self._merge_clips(clip_paths, logs)
return final_cut, logs

    def _produce_scene(self, storyboard: Storyboard, scene: SceneBeat, logs: List[str]) -> str:
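        """Generate one scene clip, trying each hosted model before falling back to a placeholder."""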
composed_prompt = self._compose_prompt(storyboard, scene)
if self.token:
for model in self.models:
try:
clip = self._call_hf_inference(composed_prompt, model, scene.duration)
logs.append(f"Scene {scene.scene_id}: generated via {model}")
return clip
except Exception as exc:
logs.append(f"Scene {scene.scene_id}: {model} failed ({exc})")
clip = create_placeholder_video(scene, storyboard.style)
logs.append(f"Scene {scene.scene_id}: fallback placeholder clip used.")
return clip

    def _call_hf_inference(self, prompt: str, model_id: str, duration: int) -> str:
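        """Request a clip from the Inference API and persist it to a temporary MP4 file."""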
if not self.token:
raise RuntimeError("Missing Hugging Face token")
client = self._build_client(model_id)
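        # Convert seconds to frames, clamped between MIN_FRAMES and MAX_FRAMES.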
frames = max(MIN_FRAMES, min(MAX_FRAMES, int(duration * FRAMES_PER_SECOND)))
video_bytes = client.text_to_video(
prompt,
model=model_id,
num_frames=frames,
)
tmp_dir = tempfile.mkdtemp(prefix="cinegen-video-")
path = os.path.join(tmp_dir, f"{model_id.split('/')[-1]}.mp4")
with open(path, "wb") as handle:
handle.write(video_bytes)
return path

    def _build_client(self, model_id: str) -> InferenceClient:
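        """Create an InferenceClient, applying a per-model provider override when configured."""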
provider = MODEL_PROVIDER_OVERRIDES.get(model_id)
kwargs = {"token": self.token}
if provider:
kwargs["provider"] = provider
return InferenceClient(**kwargs)

    @staticmethod
def _compose_prompt(storyboard: Storyboard, scene: SceneBeat) -> str:
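        """Combine storyboard metadata and scene details into a single text prompt."""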
characters = "; ".join(scene.characters)
return (
f"Title: {storyboard.title}. Style: {storyboard.style}. "
f"Scene {scene.scene_id} - {scene.title}: {scene.action} "
f"Visual cues: {scene.visuals}. Mood: {scene.mood}. "
f"Camera: {scene.camera}. Characters: {characters or 'solo sequence'}."
)

    def _merge_clips(self, clip_paths: Sequence[str], logs: List[str]) -> str:
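        """Concatenate clips into one file with MoviePy; fall back to the first clip if MoviePy is unavailable."""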
try:
from moviepy.editor import VideoFileClip, concatenate_videoclips # type: ignore
except Exception as exc:
logs.append(f"MoviePy unavailable ({exc}); returning first clip only.")
return clip_paths[0]
clips = []
for path in clip_paths:
try:
clip = VideoFileClip(path)
clips.append(clip)
except Exception as exc:
logs.append(f"Failed to read clip {path}: {exc}")
if not clips:
raise RuntimeError("No clips to merge")
final = concatenate_videoclips(clips, method="compose")
tmp_dir = tempfile.mkdtemp(prefix="cinegen-final-")
final_path = os.path.join(tmp_dir, "cinegen_short.mp4")
        final.write_videofile(final_path, fps=clips[0].fps, codec="libx264",
                              audio=False, verbose=False, logger=None)
        final.close()  # close the concatenated clip as well as the source clips
        for clip in clips:
            clip.close()
        logs.append(f"Merged {len(clips)} clips into final cut.")
return final_path