Z-Image-Turbo

Running on Zero

App Files Files Community

Z-Image-Turbo / app.py

cpuai

Update app.py

8616178 verified 8 days ago

raw

history blame contribute delete

38 kB

	import os
	import sys
	import re
	import json
	import random
	import logging
	import warnings
	import traceback
	import threading
	from dataclasses import dataclass
	from typing import Any, Dict, List, Optional, Tuple

	import gradio as gr
	import torch
	from PIL import Image, ImageDraw, ImageFont

	# ==================== `spaces` compatibility shim ====================
	# The `spaces` package is available on HuggingFace Spaces. When it cannot
	# be imported (for example on a local run), a minimal stand-in is installed
	# so that the module still loads instead of crashing.
	try:
	    import spaces  # type: ignore
	except Exception:
	    class _SpacesFallback:
	        """Stand-in for the ``spaces`` module when it is not importable."""

	        @staticmethod
	        def GPU(fn=None, **kwargs):
	            """Return ``fn`` unchanged, usable as ``@GPU`` or ``@GPU(...)``."""
	            if fn is None:
	                return lambda f: f
	            return fn

	        @staticmethod
	        def aoti_blocks_load(*args, **kwargs):
	            """Always raise RuntimeError: there is no local AoTI loader.

	            Arbitrary positional and keyword arguments are accepted so that
	            every call shape reaches the RuntimeError below instead of
	            failing earlier with a TypeError on an unexpected keyword.
	            """
	            raise RuntimeError("spaces.aoti_blocks_load is unavailable outside HuggingFace Spaces.")

	    spaces = _SpacesFallback()  # type: ignore


	from diffusers import (
	AutoencoderKL,
	DiffusionPipeline,
	FlowMatchEulerDiscreteScheduler,
	)

	from transformers import AutoModelForCausalLM, AutoTokenizer


	# ---------------- Optional dependency: Prompt Enhancer template ----------------
	# When a pe.py module can be imported (this file's directory is added to
	# sys.path first), its prompt_template is used. Otherwise the built-in
	# default below is used, so a missing pe.py never raises.
	try:
	    sys.path.append(
	        os.path.dirname(os.path.abspath(__file__))
	    )
	    from pe import prompt_template  # type: ignore
	except Exception:
	    prompt_template = "You are a helpful prompt engineer. Expand the user prompt into a richer, detailed prompt. Return JSON with key revised_prompt."


	# ==================== Environment Variables ====================
	def _env_flag(name: str, default: bool) -> bool:
	    """Read a boolean switch from the environment.

	    The flag is on only when the variable equals "true" (case-insensitive,
	    surrounding whitespace ignored); an unset variable yields ``default``.
	    """
	    fallback = "true" if default else "false"
	    return os.environ.get(name, fallback).strip().lower() == "true"


	def _env_int(name: str, default: int, minimum: int) -> int:
	    """Read an integer from the environment, tolerating bad values.

	    Returns ``default`` when the variable is unset or is not an integer,
	    and never returns less than ``minimum``.
	    """
	    raw = os.environ.get(name)
	    if raw is None:
	        return default
	    try:
	        value = int(raw.strip())
	    except ValueError:
	        print(f"[Config] Invalid integer for {name}={raw!r}, using default {default}.")
	        return default
	    return max(minimum, value)


	MODEL_PATH = os.environ.get("MODEL_PATH", "Tongyi-MAI/Z-Image-Turbo")

	# Key fix: compile is off by default to avoid first-load timeouts, compile
	# failures and ZeroGPU compatibility problems. Once the environment is known
	# to be stable, set ENABLE_COMPILE=true in the Space variables.
	ENABLE_COMPILE = _env_flag("ENABLE_COMPILE", False)

	# Key fix: warmup is off by default. The original code warmed up across many
	# resolutions, which very easily made startup fail.
	ENABLE_WARMUP = _env_flag("ENABLE_WARMUP", False)

	# "native" is the most stable default. If the environment is confirmed to
	# support flash_3, set ATTENTION_BACKEND=flash_3.
	ATTENTION_BACKEND = os.environ.get("ATTENTION_BACKEND", "native")

	# ZeroGPU AoTI: attempted by default; a failure does not affect the main flow.
	ENABLE_AOTI = _env_flag("ENABLE_AOTI", True)

	# The safety checker uses extra memory, so it is off by default to keep it
	# from dragging down the main model load. Set ENABLE_SAFETY_CHECKER=true to
	# turn it on.
	ENABLE_SAFETY_CHECKER = _env_flag("ENABLE_SAFETY_CHECKER", False)

	# Prefer loading through DiffusionPipeline; fall back to manual component
	# loading when that fails.
	USE_DIFFUSION_PIPELINE = _env_flag("USE_DIFFUSION_PIPELINE", True)

	# Number of generated images kept in the gallery, so the history does not
	# keep growing in memory. Clamped to at least 1 so that the newest image is
	# always kept by the gallery slicing done at generation time.
	MAX_GALLERY_HISTORY = _env_int("MAX_GALLERY_HISTORY", 8, 1)

	DASHSCOPE_API_KEY = os.environ.get("DASHSCOPE_API_KEY")
	HF_TOKEN = os.environ.get("HF_TOKEN")
	# ===============================================================


	# Process-wide runtime setup: disable tokenizer parallelism, silence all
	# warnings, and only let the transformers logger report errors.
	os.environ["TOKENIZERS_PARALLELISM"] = "false"
	warnings.filterwarnings("ignore")
	logging.getLogger("transformers").setLevel(logging.ERROR)


	# Device / dtype as seen at import time: bfloat16 on CUDA, float32 on CPU.
	if torch.cuda.is_available():
	    DEVICE, DTYPE = "cuda", torch.bfloat16
	else:
	    DEVICE, DTYPE = "cpu", torch.float32

	# Module-level state: both objects start unset, the lock guards model
	# loading, and MODEL_LOAD_ERROR holds the last load failure text.
	pipe = None
	prompt_expander = None
	model_lock = threading.Lock()
	MODEL_LOAD_ERROR = ""


	# Resolution labels offered in the UI, grouped by resolution category.
	# Each label has the form "<width>x<height> ( <ratio> )".
	RES_CHOICES = {
	    "1024": [
	        "1024x1024 ( 1:1 )",
	        "1152x896 ( 9:7 )",
	        "896x1152 ( 7:9 )",
	        "1152x864 ( 4:3 )",
	        "864x1152 ( 3:4 )",
	        "1248x832 ( 3:2 )",
	        "832x1248 ( 2:3 )",
	        "1280x720 ( 16:9 )",
	        "720x1280 ( 9:16 )",
	        "1344x576 ( 21:9 )",
	        "576x1344 ( 9:21 )",
	    ],
	    "1280": [
	        "1280x1280 ( 1:1 )",
	        "1440x1120 ( 9:7 )",
	        "1120x1440 ( 7:9 )",
	        "1472x1104 ( 4:3 )",
	        "1104x1472 ( 3:4 )",
	        "1536x1024 ( 3:2 )",
	        "1024x1536 ( 2:3 )",
	        "1536x864 ( 16:9 )",
	        "864x1536 ( 9:16 )",
	        "1680x720 ( 21:9 )",
	        "720x1680 ( 9:21 )",
	    ],
	    "1536": [
	        "1536x1536 ( 1:1 )",
	        "1728x1344 ( 9:7 )",
	        "1344x1728 ( 7:9 )",
	        "1728x1296 ( 4:3 )",
	        "1296x1728 ( 3:4 )",
	        "1872x1248 ( 3:2 )",
	        "1248x1872 ( 2:3 )",
	        "2048x1152 ( 16:9 )",
	        "1152x2048 ( 9:16 )",
	        "2016x864 ( 21:9 )",
	        "864x2016 ( 9:21 )",
	    ],
	}

	# Every label from every category, flattened in category order.
	RESOLUTION_SET: List[str] = [
	    label for labels in RES_CHOICES.values() for label in labels
	]

	# Example prompts handed to gr.Examples in the UI; each inner list is one
	# example row holding a single value for the prompt textbox.
	EXAMPLE_PROMPTS = [
	["一位男士和他的贵宾犬穿着配套的服装参加狗狗秀，室内灯光，背景中有观众。"],
	["极具氛围感的暗调人像，一位优雅的中国美女在黑暗的房间里。一束强光通过遮光板，在她的脸上投射出一个清晰的闪电形状的光影，正好照亮一只眼睛。高对比度，明暗交界清晰，神秘感，莱卡相机色调。"],
	]


	def refresh_runtime_device() -> Tuple[str, torch.dtype]:
	    """Re-detect the compute device and dtype and store them in the module globals.

	    Key fix: in a ZeroGPU environment CUDA may not be available while the
	    application starts; it may only become visible once execution is inside
	    a @spaces.GPU function. DEVICE / DTYPE therefore have to be re-evaluated
	    from inside the generation path.

	    Returns:
	        The refreshed ``(DEVICE, DTYPE)`` pair: ``("cuda", torch.bfloat16)``
	        when CUDA is available, otherwise ``("cpu", torch.float32)``.
	    """
	    global DEVICE, DTYPE

	    if torch.cuda.is_available():
	        DEVICE, DTYPE = "cuda", torch.bfloat16
	    else:
	        DEVICE, DTYPE = "cpu", torch.float32

	    print(f"[Runtime] DEVICE={DEVICE}, DTYPE={DTYPE}")
	    return DEVICE, DTYPE


	def cuda_cleanup():
	    """Release cached CUDA memory on a best-effort basis.

	    Meant to be called after a failure so that later requests are less
	    likely to keep failing. Does nothing when CUDA is unavailable and never
	    raises: a cleanup problem is reported on stdout and then ignored.
	    """
	    try:
	        if torch.cuda.is_available():
	            torch.cuda.empty_cache()
	            torch.cuda.ipc_collect()
	    except Exception as exc:
	        # Cleanup runs inside error handlers; it must not mask the error the
	        # caller is already dealing with, so the failure is only reported.
	        print(f"[Cleanup] CUDA cache cleanup failed, ignored: {type(exc).__name__}: {exc}")


	def is_local_model_path(model_path: str) -> bool:
	    """Return True when ``model_path`` is an existing directory on disk."""
	    exists_as_directory = os.path.isdir(model_path)
	    return exists_as_directory


	def hf_token_candidates() -> List[Dict[str, str]]:
	    """List the keyword-argument sets to try when authenticating a download.

	    Written for compatibility across transformers / diffusers versions:
	    newer releases generally take ``token`` while older ones may take
	    ``use_auth_token``. The two are never passed together, because some
	    environments raise when both are given.

	    Returns:
	        ``[{}]`` when HF_TOKEN is not set; otherwise the ``token`` form, the
	        ``use_auth_token`` form and finally an empty (anonymous) form.
	    """
	    if HF_TOKEN:
	        return [{"token": HF_TOKEN}, {"use_auth_token": HF_TOKEN}, {}]
	    return [{}]


	def get_resolution(resolution: str) -> Tuple[int, int]:
	    """Parse the ``<width>x<height>`` pair out of a resolution label.

	    Accepts ``x`` or ``×`` as the separator, with or without whitespace
	    around it, so both "1152x896 ( 9:7 )" and "1152 × 896" are understood.

	    Args:
	        resolution: A label such as "1024x1024 ( 1:1 )". Any value is
	            accepted; it is converted with ``str()`` before matching.

	    Returns:
	        ``(width, height)`` as integers, or ``(1024, 1024)`` when no
	        resolution can be found in the input.
	    """
	    # The whitespace around the separator must be optional: the UI labels
	    # are written without any ("1152x896"), and a mandatory \s would make
	    # every one of them fall through to the 1024x1024 default.
	    match = re.search(r"(\d+)\s*[×x]\s*(\d+)", str(resolution))
	    if match is None:
	        return 1024, 1024
	    return int(match.group(1)), int(match.group(2))


	def _make_blocked_image(width=1024, height=1024, text="Blocked by Safety Checker") -> Image.Image:
	    """Build a dark placeholder image with a red banner carrying ``text``.

	    Args:
	        width: Image width in pixels.
	        height: Image height in pixels.
	        text: Message drawn in white on the banner.

	    Returns:
	        A new RGB image of the requested size.
	    """
	    canvas = Image.new("RGB", (width, height), (20, 20, 20))
	    painter = ImageDraw.Draw(canvas)

	    # Fall back to font=None if the default font cannot be loaded.
	    try:
	        banner_font = ImageFont.load_default()
	    except Exception:
	        banner_font = None

	    # Red strip across the top (90 px tall), then the message on top of it.
	    painter.rectangle([0, 0, width, 90], fill=(160, 0, 0))
	    painter.text((20, 30), text, fill=(255, 255, 255), font=banner_font)
	    return canvas


	def _load_nsfw_placeholder(width=1024, height=1024) -> Image.Image:
	    """Return the image shown in place of a result flagged as NSFW.

	    ``nsfw.png`` from the current working directory is used when it exists
	    and can be read. Otherwise a placeholder is generated, so a missing or
	    broken file never causes a second error.

	    Args:
	        width: Width of the generated placeholder (not applied to nsfw.png).
	        height: Height of the generated placeholder (not applied to nsfw.png).

	    Returns:
	        An RGB image.
	    """
	    if os.path.exists("nsfw.png"):
	        try:
	            # The context manager closes the underlying file once convert()
	            # has produced an independent in-memory copy.
	            with Image.open("nsfw.png") as source:
	                return source.convert("RGB")
	        except Exception as exc:
	            print(f"[Safety] Could not read nsfw.png, using a generated placeholder: {exc}")
	    return _make_blocked_image(width, height, "NSFW blocked")


	def _move_pipeline_to_device(p) -> Any:
	    """Move a pipeline to the current DEVICE, trying several ``.to()`` call forms.

	    The different call forms exist for compatibility across diffusers
	    versions. The first form that does not raise wins; if all of them raise,
	    a warning is printed and the pipeline is returned as it was.

	    Args:
	        p: The pipeline, or None.

	    Returns:
	        The result of the first successful ``.to()`` call, or ``p`` itself
	        when ``p`` is None, has a non-empty ``hf_device_map``, or could not
	        be moved.
	    """
	    if p is None:
	        return p

	    # A pipeline loaded with device_map should normally not be moved again.
	    if getattr(p, "hf_device_map", None):
	        print(f"[Init] Pipeline already has hf_device_map: {getattr(p, 'hf_device_map', None)}")
	        return p

	    # Each entry is (positional args, keyword args) for one p.to(...) attempt.
	    if DEVICE == "cuda":
	        call_forms = [
	            (("cuda",), {}),
	            ((), {"torch_dtype": DTYPE}),
	            ((), {"device": "cuda"}),
	            (("cuda",), {"torch_dtype": DTYPE}),
	        ]
	    else:
	        call_forms = [
	            (("cpu",), {}),
	            ((), {"torch_dtype": torch.float32}),
	            ((), {"device": "cpu"}),
	        ]

	    failure = None
	    for positional, named in call_forms:
	        try:
	            return p.to(*positional, **named)
	        except Exception as exc:
	            failure = exc

	    print(f"[Init] Warning: pipeline.to(...) failed, continue anyway. Error: {failure}")
	    return p


	def _set_attention_backend_if_possible(p, backend: str) -> None:
	    """Select an attention backend on the pipeline's transformer, if it can be done.

	    Not every environment supports every backend. When the requested one is
	    rejected, "native" is tried instead; if that fails as well the error is
	    printed and ignored so the main flow is never blocked.

	    Args:
	        p: The pipeline (may be None or falsy, in which case nothing happens).
	        backend: Name passed to ``transformer.set_attention_backend``.
	    """
	    transformer = getattr(p, "transformer", None) if p else None
	    if transformer is None:
	        return

	    if not hasattr(transformer, "set_attention_backend"):
	        print("[Init] Transformer has no set_attention_backend method, skip.")
	        return

	    try:
	        transformer.set_attention_backend(backend)
	    except Exception as e:
	        print(f"[Init] set_attention_backend('{backend}') failed: {e}")
	    else:
	        print(f"[Init] Attention backend set to: {backend}")
	        return

	    # The requested backend was rejected: fall back to "native".
	    try:
	        transformer.set_attention_backend("native")
	    except Exception as e:
	        print(f"[Init] set_attention_backend('native') also failed, ignored: {e}")
	    else:
	        print("[Init] Attention backend fallback to: native")


	def _compile_transformer_if_possible(p) -> Any:
	    """Wrap the pipeline's transformer with ``torch.compile`` when enabled.

	    torch.compile may speed things up, but it can also make the first run
	    very slow or fail outright. It is therefore off by default, and a
	    failure here never affects the main flow.

	    Args:
	        p: The pipeline.

	    Returns:
	        The same pipeline object, with ``p.transformer`` replaced by the
	        compiled module when compilation was enabled and succeeded.
	    """
	    if not ENABLE_COMPILE:
	        return p

	    if DEVICE != "cuda":
	        print("[Init] ENABLE_COMPILE=true but DEVICE is not cuda, skip compile.")
	        return p

	    transformer = getattr(p, "transformer", None)
	    if transformer is None:
	        print("[Init] No transformer found, skip compile.")
	        return p

	    try:
	        print("[Init] Enabling torch.compile optimizations...")
	        inductor_cfg = torch._inductor.config
	        # Inductor options are applied in this fixed order before compiling.
	        for option, value in (
	            ("conv_1x1_as_mm", True),
	            ("coordinate_descent_tuning", True),
	            ("epilogue_fusion", False),
	            ("coordinate_descent_check_all_directions", True),
	            ("max_autotune_gemm", True),
	            ("max_autotune_gemm_backends", "TRITON,ATEN"),
	        ):
	            setattr(inductor_cfg, option, value)
	        inductor_cfg.triton.cudagraphs = False

	        p.transformer = torch.compile(
	            transformer,
	            mode="max-autotune-no-cudagraphs",
	            fullgraph=False,
	        )
	        print("[Init] Transformer compiled.")
	    except Exception:
	        print("[Init] torch.compile failed, continue without compile:")
	        traceback.print_exc()

	    return p


	def try_enable_aoti(p) -> None:
	    """Try to load AoTI blocks for the transformer (ZeroGPU acceleration).

	    Used when available and skipped otherwise: any exception is printed with
	    its traceback and then ignored.

	    Args:
	        p: The pipeline, or None (in which case nothing happens).
	    """
	    if not ENABLE_AOTI:
	        print("[Init] ENABLE_AOTI=false, skip AoTI.")
	        return

	    if p is None:
	        return

	    try:
	        transformer = getattr(p, "transformer", None)
	        if transformer is None:
	            print("[Init] No transformer found, skip AoTI.")
	            return

	        # Target the transformer's ``layers`` attribute when it has one,
	        # otherwise the transformer itself.
	        target = transformer.layers if hasattr(transformer, "layers") else transformer
	        if hasattr(target, "_repeated_blocks"):
	            target._repeated_blocks = ["ZImageTransformerBlock"]

	        if target is not None:
	            spaces.aoti_blocks_load(target, "zerogpu-aoti/Z-Image", variant="fa3")
	            print("[Init] AoTI blocks loaded.")
	    except Exception:
	        print("[Init] AoTI not enabled, safe to ignore:")
	        traceback.print_exc()


	def _load_safety_checker_if_enabled(p) -> Any:
	    """Attach a safety checker to the pipeline, or explicitly detach it.

	    The safety checker is off by default because it uses extra memory. Even
	    when it is switched on, a loading failure does not affect the main
	    model: the checker attributes are set to None and NSFW filtering is
	    skipped.

	    Args:
	        p: The pipeline.

	    Returns:
	        The same pipeline, with ``safety_feature_extractor`` and
	        ``safety_checker`` either both loaded or both set to None (when the
	        attributes can be assigned at all).
	    """

	    def _detach_checker() -> None:
	        # Assignment may fail on some pipeline objects; that is ignored.
	        try:
	            p.safety_feature_extractor = None
	            p.safety_checker = None
	        except Exception:
	            pass

	    if not ENABLE_SAFETY_CHECKER:
	        print("[Init] ENABLE_SAFETY_CHECKER=false, skip safety checker.")
	        _detach_checker()
	        return p

	    try:
	        from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker

	        # Use CLIPImageProcessor when importable, else CLIPFeatureExtractor.
	        try:
	            from transformers import CLIPImageProcessor as _CLIPProcessor
	        except Exception:
	            from transformers import CLIPFeatureExtractor as _CLIPProcessor  # type: ignore

	        safety_model_id = "CompVis/stable-diffusion-safety-checker"
	        checker_dtype = torch.float16 if DEVICE == "cuda" else torch.float32
	        last_error = None

	        # Try each authentication form until one loads both components.
	        for token_kwargs in hf_token_candidates():
	            try:
	                extractor = _CLIPProcessor.from_pretrained(
	                    safety_model_id,
	                    **token_kwargs,
	                )
	                checker = StableDiffusionSafetyChecker.from_pretrained(
	                    safety_model_id,
	                    torch_dtype=checker_dtype,
	                    **token_kwargs,
	                )
	                checker = checker.to(DEVICE)

	                p.safety_feature_extractor = extractor
	                p.safety_checker = checker
	                print("[Init] Safety checker loaded.")
	                return p
	            except Exception as e:
	                last_error = e

	        raise RuntimeError(f"Safety checker load failed: {last_error}")

	    except Exception:
	        print("[Init] Safety checker init failed. NSFW filtering will be skipped:")
	        traceback.print_exc()
	        _detach_checker()
	        return p


	def _load_with_diffusion_pipeline(model_path: str) -> Any:
	    """Load the model through ``DiffusionPipeline.from_pretrained``.

	    This is the preferred loading strategy. Several keyword-argument
	    combinations are tried in turn for compatibility with both newer and
	    older diffusers versions; the first one that loads is moved to the
	    current device and returned.

	    NOTE(review): the last CUDA candidates pass ``trust_remote_code=True``,
	    which is an opt-in to running code supplied by the model repository.
	    Confirm that MODEL_PATH always points at a trusted source.

	    Args:
	        model_path: A local directory or a Hub repository id.

	    Returns:
	        The loaded pipeline.

	    Raises:
	        RuntimeError: If every attempt failed. The message lists the last
	            eight failures, with authentication values masked.
	    """
	    print("[Init] Trying DiffusionPipeline loading strategy...")

	    # No authentication is needed (or tried) for a local directory.
	    local = is_local_model_path(model_path)
	    token_candidates = [{}] if local else hf_token_candidates()

	    dtype_candidates: List[Dict[str, Any]]
	    if DEVICE == "cuda":
	        dtype_candidates = [
	            # Some newer diffusers documentation uses `dtype`.
	            {"dtype": DTYPE, "device_map": "cuda"},
	            {"dtype": DTYPE},

	            # Older versions commonly use `torch_dtype`.
	            {"torch_dtype": DTYPE, "device_map": "cuda"},
	            {"torch_dtype": DTYPE},

	            # Some Z-Image examples need low_cpu_mem_usage=False.
	            {"torch_dtype": DTYPE, "low_cpu_mem_usage": False},
	            {"dtype": DTYPE, "low_cpu_mem_usage": False},

	            # Compatibility with custom pipelines / older discussions.
	            {"torch_dtype": DTYPE, "trust_remote_code": True},
	            {"dtype": DTYPE, "trust_remote_code": True},
	        ]
	    else:
	        dtype_candidates = [
	            {"torch_dtype": torch.float32},
	            {"dtype": torch.float32},
	            {"torch_dtype": torch.float32, "low_cpu_mem_usage": False},
	            {},
	        ]

	    # Keys whose values are credentials and must never be written to logs or
	    # error messages (the load error text is later shown in the UI).
	    secret_keys = ("token", "use_auth_token")
	    errors: List[str] = []

	    for token_kwargs in token_candidates:
	        for extra_kwargs in dtype_candidates:
	            kwargs: Dict[str, Any] = {**token_kwargs, **extra_kwargs}

	            try:
	                print(f"[Init] DiffusionPipeline.from_pretrained kwargs={list(kwargs.keys())}")
	                p = DiffusionPipeline.from_pretrained(model_path, **kwargs)
	                print("[Init] DiffusionPipeline loaded.")
	                return _move_pipeline_to_device(p)
	            except Exception as e:
	                safe_kwargs = {
	                    key: ("***" if key in secret_keys else value)
	                    for key, value in kwargs.items()
	                }
	                err = f"kwargs={safe_kwargs} -> {type(e).__name__}: {e}"
	                print(f"[Init] DiffusionPipeline attempt failed: {err}")
	                errors.append(err)

	    raise RuntimeError(
	        "All DiffusionPipeline loading attempts failed.\n"
	        + "\n".join(errors[-8:])
	    )


	def _from_pretrained_component(cls, path_or_repo: str, subfolder: Optional[str], torch_dtype: Optional[torch.dtype]) -> Any:
	    """Load one model component with ``cls.from_pretrained``.

	    Compatibility wrapper used by the manual component loading strategy.

	    Args:
	        cls: A class exposing ``from_pretrained``.
	        path_or_repo: A local model directory or a Hub repository id.
	        subfolder: Component subfolder, or None/empty for the root.
	        torch_dtype: Passed as ``torch_dtype`` unless it is None.

	    Returns:
	        Whatever ``cls.from_pretrained`` returns.

	    Raises:
	        RuntimeError: For a remote repository, when every authentication
	            form failed (the message carries the last four errors).
	        Exception: For a local directory, whatever ``from_pretrained``
	            raises is propagated unchanged.
	    """
	    if is_local_model_path(path_or_repo):
	        # Local layout: join the subfolder into the path instead of passing
	        # it as a keyword; no authentication is involved.
	        load_path = os.path.join(path_or_repo, subfolder) if subfolder else path_or_repo
	        kwargs: Dict[str, Any] = {}
	        if torch_dtype is not None:
	            kwargs["torch_dtype"] = torch_dtype
	        return cls.from_pretrained(load_path, **kwargs)

	    errors = []
	    for token_kwargs in hf_token_candidates():
	        kwargs = {}
	        if subfolder:
	            kwargs["subfolder"] = subfolder
	        if torch_dtype is not None:
	            kwargs["torch_dtype"] = torch_dtype
	        kwargs.update(token_kwargs)

	        try:
	            return cls.from_pretrained(path_or_repo, **kwargs)
	        except Exception as e:
	            errors.append(f"{type(e).__name__}: {e}")

	    # Plain " | " separator: the previous "\|" was an invalid escape
	    # sequence and put a literal backslash into the message.
	    raise RuntimeError(
	        f"Failed to load component {cls} subfolder={subfolder}. "
	        + " | ".join(errors[-4:])
	    )


	def _load_with_manual_components(model_path: str) -> Any:
	    """Fallback strategy: load VAE, text encoder, tokenizer and transformer by hand.

	    If the installed diffusers does not provide ZImagePipeline /
	    ZImageTransformer2DModel, an explicit error is raised here.

	    Args:
	        model_path: A local directory or a Hub repository id.

	    Returns:
	        The assembled pipeline after the device move was attempted.

	    Raises:
	        RuntimeError: When the Z-Image classes cannot be imported.
	    """
	    print("[Init] Trying manual component loading strategy...")

	    try:
	        from diffusers import ZImagePipeline  # type: ignore
	        from diffusers.models.transformers.transformer_z_image import ZImageTransformer2DModel  # type: ignore
	    except Exception as import_error:
	        raise RuntimeError(
	            "Current diffusers does not provide ZImagePipeline / ZImageTransformer2DModel. "
	            "Please upgrade diffusers, for example: pip install -U diffusers transformers accelerate"
	        ) from import_error

	    component_dtype = torch.float32 if DEVICE != "cuda" else DTYPE

	    vae = _from_pretrained_component(AutoencoderKL, model_path, "vae", component_dtype)

	    text_encoder = _from_pretrained_component(
	        AutoModelForCausalLM, model_path, "text_encoder", component_dtype
	    )
	    text_encoder = text_encoder.eval()

	    # The tokenizer takes no dtype; it is switched to left padding.
	    tokenizer = _from_pretrained_component(AutoTokenizer, model_path, "tokenizer", None)
	    tokenizer.padding_side = "left"

	    # The pipeline is built without scheduler and transformer; the
	    # transformer is attached right below.
	    p = ZImagePipeline(
	        scheduler=None,
	        vae=vae,
	        text_encoder=text_encoder,
	        tokenizer=tokenizer,
	        transformer=None,
	    )

	    loaded_transformer = _from_pretrained_component(
	        ZImageTransformer2DModel, model_path, "transformer", None
	    )
	    p.transformer = loaded_transformer.to(DEVICE, DTYPE)
	    p = _move_pipeline_to_device(p)

	    print("[Init] Manual component loading finished.")
	    return p


	def load_models(model_path: str) -> Any:
	    """Single entry point for model loading.

	    DiffusionPipeline loading is tried first (when USE_DIFFUSION_PIPELINE is
	    on); if it fails, manual component loading is used as the fallback. A
	    successfully loaded pipeline then gets the attention backend, optional
	    compilation and the optional safety checker applied.

	    Args:
	        model_path: A local directory or a Hub repository id.

	    Returns:
	        The ready-to-use pipeline.

	    Raises:
	        RuntimeError: When every strategy failed; the message carries the
	            error of each strategy.
	    """
	    separator = "=" * 80
	    print(separator)
	    print(f"[Init] Loading model from: {model_path}")
	    print(f"[Init] DEVICE={DEVICE}, DTYPE={DTYPE}")
	    print(f"[Init] USE_DIFFUSION_PIPELINE={USE_DIFFUSION_PIPELINE}")
	    print(f"[Init] ENABLE_COMPILE={ENABLE_COMPILE}")
	    print(f"[Init] ENABLE_WARMUP={ENABLE_WARMUP}")
	    print(f"[Init] ATTENTION_BACKEND={ATTENTION_BACKEND}")
	    print(separator)

	    def _finish(loaded):
	        # Post-load steps shared by both strategies, in this order.
	        _set_attention_backend_if_possible(loaded, ATTENTION_BACKEND)
	        loaded = _compile_transformer_if_possible(loaded)
	        return _load_safety_checker_if_enabled(loaded)

	    first_failure = None

	    if USE_DIFFUSION_PIPELINE:
	        try:
	            return _finish(_load_with_diffusion_pipeline(model_path))
	        except Exception as exc:
	            first_failure = exc
	            print("[Init] DiffusionPipeline strategy failed:")
	            traceback.print_exc()
	            cuda_cleanup()

	    try:
	        return _finish(_load_with_manual_components(model_path))
	    except Exception as exc:
	        print("[Init] Manual component strategy failed:")
	        traceback.print_exc()
	        cuda_cleanup()

	        raise RuntimeError(
	            "Model loading failed in all strategies. "
	            f"First error: {first_failure}. "
	            f"Second error: {exc}"
	        ) from exc


	def generate_image(
	    p,
	    prompt: str,
	    resolution: str = "1024x1024",
	    seed: int = 42,
	    guidance_scale: float = 0.0,
	    num_inference_steps: int = 9,
	    shift: float = 3.0,
	    max_sequence_length: int = 512,
	) -> Image.Image:
	    """Generate a single image with the given pipeline.

	    Args:
	        p: The pipeline; it is called with keyword arguments and its
	            ``scheduler`` attribute is replaced on every call.
	        prompt: Text prompt.
	        resolution: Resolution text parsed by ``get_resolution``.
	        seed: Seed for the random generator.
	        guidance_scale: Passed through to the pipeline.
	        num_inference_steps: Passed through to the pipeline.
	        shift: ``shift`` value of the flow-match scheduler.
	        max_sequence_length: Passed through when the pipeline accepts it.

	    Returns:
	        The first generated image, converted to RGB.
	    """
	    width, height = get_resolution(resolution)

	    # The generator lives on CUDA when running on CUDA, else on the default device.
	    rng = torch.Generator(device="cuda") if DEVICE == "cuda" else torch.Generator()
	    rng = rng.manual_seed(int(seed))

	    # Z-Image-Turbo commonly uses FlowMatchEulerDiscreteScheduler.
	    try:
	        p.scheduler = FlowMatchEulerDiscreteScheduler(
	            num_train_timesteps=1000,
	            shift=float(shift),
	        )
	    except Exception:
	        print("[Generate] Failed to assign scheduler, continue with existing scheduler:")
	        traceback.print_exc()

	    call_kwargs = {
	        "prompt": prompt,
	        "height": int(height),
	        "width": int(width),
	        "guidance_scale": float(guidance_scale),
	        "num_inference_steps": int(num_inference_steps),
	        "generator": rng,
	        "max_sequence_length": int(max_sequence_length),
	    }

	    # Parameter support differs between pipeline versions: on a TypeError,
	    # retry once without max_sequence_length.
	    try:
	        result = p(**call_kwargs)
	    except TypeError:
	        call_kwargs.pop("max_sequence_length", None)
	        result = p(**call_kwargs)

	    picture = result.images[0]
	    if not isinstance(picture, Image.Image):
	        picture = Image.fromarray(picture)

	    return picture.convert("RGB")


	def warmup_model(p) -> None:
	    """Run a minimal warmup generation when ENABLE_WARMUP is on.

	    The original code iterated over every resolution and generated two
	    images for each, which was very risky. This version runs a single
	    short generation at 1024x1024, and only when the user explicitly sets
	    ENABLE_WARMUP=true. A warmup failure is printed and ignored.

	    Args:
	        p: The pipeline to warm up.
	    """
	    if not ENABLE_WARMUP:
	        return

	    warmup_settings = {
	        "prompt": "warmup",
	        "resolution": "1024x1024",
	        "num_inference_steps": 2,
	        "guidance_scale": 0.0,
	        "seed": 42,
	    }

	    try:
	        print("[Warmup] Starting minimal warmup...")
	        generate_image(p, **warmup_settings)
	        print("[Warmup] Completed.")
	    except Exception:
	        print("[Warmup] Failed, ignored:")
	        traceback.print_exc()
	        cuda_cleanup()


	# ==================== Prompt Expander ====================
	@dataclass
	class PromptOutput:
	    """Result record returned by a prompt expander call."""

	    # True when the expansion call succeeded.
	    status: bool
	    # The expanded prompt (empty string on the failure paths).
	    prompt: str
	    # The seed value that was passed to the expander.
	    seed: int
	    # The system prompt that was in effect for the call.
	    system_prompt: str
	    # Raw model reply on success, or an error description on failure.
	    message: str


	class PromptExpander:
	    """Base class for prompt expanders: stores the backend name and supplies the system prompt."""

	    def __init__(self, backend="api", **kwargs):
	        # Additional keyword arguments are accepted and not used here.
	        self.backend = backend

	    def decide_system_prompt(self, template_name=None):
	        """Return the module-level ``prompt_template``; ``template_name`` is not used."""
	        return prompt_template


	class APIPromptExpander(PromptExpander):
	    """Prompt expander that calls a chat-completions API through the ``openai`` client."""

	    def __init__(self, api_config=None, **kwargs):
	        """Store the configuration and try to create the API client.

	        Args:
	            api_config: Optional dict; the keys read are ``api_key``,
	                ``base_url`` and ``model``.
	            **kwargs: Forwarded to ``PromptExpander.__init__``.
	        """
	        super().__init__(backend="api", **kwargs)
	        self.api_config = api_config or {}
	        self.client = self._init_api_client()

	    def _init_api_client(self):
	        """Create the OpenAI client, or return None when that is not possible.

	        None is returned (with a printed reason) when no API key is
	        configured, when the ``openai`` package is missing, or when client
	        construction raises.
	        """
	        try:
	            from openai import OpenAI

	            api_key = self.api_config.get("api_key") or DASHSCOPE_API_KEY
	            base_url = self.api_config.get(
	                "base_url",
	                "https://dashscope.aliyuncs.com/compatible-mode/v1",
	            )

	            if not api_key:
	                print("[PE] Warning: DASHSCOPE_API_KEY not found. Prompt enhance unavailable.")
	                return None

	            return OpenAI(api_key=api_key, base_url=base_url)

	        except ImportError:
	            print("[PE] openai package not installed. Prompt enhance unavailable.")
	            return None
	        except Exception:
	            print("[PE] Failed to initialize API client:")
	            traceback.print_exc()
	            return None

	    @staticmethod
	    def _extract_revised_prompt(content):
	        """Pull ``revised_prompt`` out of a model reply.

	        Handles a reply containing a ```json fenced block as well as a
	        reply that is a bare JSON object. Anything else, including JSON
	        without a usable string ``revised_prompt``, yields the reply
	        unchanged.
	        """
	        fence_start = content.find("```json")
	        if fence_start != -1:
	            fence_end = content.find("```", fence_start + 7)
	            if fence_end == -1:
	                return content
	            candidate = content[fence_start + 7: fence_end].strip()
	        else:
	            candidate = content.strip()
	            # Only attempt to parse text that looks like a JSON object.
	            if not candidate.startswith("{"):
	                return content

	        try:
	            data = json.loads(candidate)
	        except ValueError:
	            return content

	        if isinstance(data, dict):
	            revised = data.get("revised_prompt")
	            if isinstance(revised, str) and revised.strip():
	                return revised
	        return content

	    def __call__(self, prompt, system_prompt=None, seed=-1, **kwargs):
	        """Shorthand for ``extend``."""
	        return self.extend(prompt, system_prompt, seed, **kwargs)

	    def extend(self, prompt, system_prompt=None, seed=-1, **kwargs):
	        """Expand ``prompt`` through the API.

	        Args:
	            prompt: The user prompt.
	            system_prompt: System prompt; the default template is used when
	                None. If it contains ``{prompt}``, the user prompt is
	                formatted into it and a single space is sent as the user
	                message.
	            seed: Stored in the returned record; not sent to the API.

	        Returns:
	            A PromptOutput. ``status`` is False when no client is available
	            or the API call raised; ``message`` then holds the reason.
	        """
	        if self.client is None:
	            return PromptOutput(False, "", seed, system_prompt or "", "API client not initialized")

	        if system_prompt is None:
	            system_prompt = self.decide_system_prompt()

	        if "{prompt}" in system_prompt:
	            system_prompt = system_prompt.format(prompt=prompt)
	            prompt = " "

	        try:
	            model = self.api_config.get("model", "qwen3-max-preview")
	            response = self.client.chat.completions.create(
	                model=model,
	                messages=[
	                    {"role": "system", "content": system_prompt},
	                    {"role": "user", "content": prompt},
	                ],
	                temperature=0.7,
	                top_p=0.8,
	            )

	            content = response.choices[0].message.content or ""
	            expanded_prompt = self._extract_revised_prompt(content)

	            return PromptOutput(True, expanded_prompt, seed, system_prompt, content)

	        except Exception as e:
	            return PromptOutput(False, "", seed, system_prompt, str(e))


	def create_prompt_expander(backend="api", **kwargs):
	    """Build a prompt expander for ``backend``.

	    Args:
	        backend: Only "api" is supported.
	        **kwargs: Forwarded to ``APIPromptExpander``.

	    Raises:
	        ValueError: For any backend other than "api".
	    """
	    if backend != "api":
	        raise ValueError("Only 'api' backend is supported.")
	    return APIPromptExpander(**kwargs)


	def get_or_create_prompt_expander():
	    """Return the shared prompt expander, creating it on first use.

	    The expander is created lazily so that openai / API-key problems cannot
	    affect application startup.

	    Returns:
	        The expander, or None when its creation raised.
	    """
	    global prompt_expander

	    if prompt_expander is None:
	        try:
	            prompt_expander = create_prompt_expander(
	                backend="api",
	                api_config={"model": "qwen3-max-preview"},
	            )
	            print("[PE] Prompt expander ready.")
	        except Exception:
	            print("[PE] Prompt expander init failed:")
	            traceback.print_exc()
	            prompt_expander = None

	    return prompt_expander


	def prompt_enhance(prompt: str, enable_enhance: bool) -> Tuple[str, str]:
	    """Optionally expand ``prompt`` with the prompt expander.

	    Args:
	        prompt: The user prompt.
	        enable_enhance: When False the prompt is returned untouched.

	    Returns:
	        ``(prompt_to_use, status_message)``. The original prompt is
	        returned whenever enhancement is disabled, unavailable or failed.
	    """
	    if not enable_enhance:
	        return prompt, "Enhancement disabled."

	    expander = get_or_create_prompt_expander()
	    if not expander:
	        return prompt, "Prompt expander unavailable."

	    if not prompt.strip():
	        return "", "Please enter a prompt."

	    try:
	        outcome = expander(prompt)
	        if not outcome.status:
	            return prompt, f"Enhancement failed: {outcome.message}"
	        return outcome.prompt, outcome.message
	    except Exception as exc:
	        return prompt, f"Error: {str(exc)}"


	def get_or_load_pipe():
	    """Return the shared pipeline, loading it on first use.

	    Key fix: the large model is not loaded while the application starts. It
	    is loaded lazily, after Generate is clicked and execution is inside the
	    @spaces.GPU environment.

	    Returns:
	        The loaded pipeline.

	    Raises:
	        gr.Error: When loading failed. The full traceback is printed and
	            kept in MODEL_LOAD_ERROR.
	    """
	    global pipe, MODEL_LOAD_ERROR

	    refresh_runtime_device()

	    # Already loaded: no need to take the lock.
	    if pipe is not None:
	        return pipe

	    with model_lock:
	        # Check again while holding the lock before starting a load.
	        if pipe is None:
	            try:
	                MODEL_LOAD_ERROR = ""
	                candidate = load_models(MODEL_PATH)

	                try_enable_aoti(candidate)
	                warmup_model(candidate)

	                pipe = candidate
	                print("[Init] Model loaded successfully.")
	            except Exception:
	                MODEL_LOAD_ERROR = traceback.format_exc()
	                print("[Init] Model loading failed with full traceback:")
	                print(MODEL_LOAD_ERROR)
	                pipe = None
	                cuda_cleanup()

	                raise gr.Error(
	                    "Model loading failed. Please open the Space logs to view the full traceback. "
	                    "Common fixes: upgrade diffusers/transformers/accelerate, disable compile/warmup, "
	                    "or check MODEL_PATH / HF_TOKEN."
	                )

	        return pipe


	def normalize_gallery_items(gallery_images) -> List[Any]:
	    """Convert the Gallery component's value into a list of PIL images.

	    The value format differs between Gradio versions; the accepted item
	    forms are a PIL image, a file path, a tuple/list whose first element is
	    one of those, or a dict with an ``image`` / ``path`` / ``name`` entry.
	    Items that cannot be converted are skipped.

	    Args:
	        gallery_images: The gallery value, or None/empty.

	    Returns:
	        At most ``MAX_GALLERY_HISTORY - 1`` images, leaving room for the
	        newly generated one.
	    """
	    if not gallery_images:
	        return []

	    keep = max(0, MAX_GALLERY_HISTORY - 1)
	    images: List[Any] = []

	    for item in gallery_images:
	        # Stop early: anything past `keep` would be sliced away anyway, so
	        # there is no point in opening more files.
	        if len(images) >= keep:
	            break
	        try:
	            if isinstance(item, (tuple, list)):
	                candidate = item[0] if item else None
	            elif isinstance(item, dict):
	                candidate = item.get("image") or item.get("path") or item.get("name")
	                # NOTE(review): some Gradio payloads appear to nest the file
	                # info as {"image": {"path": ...}}; confirm against the
	                # Gradio version in use.
	                if isinstance(candidate, dict):
	                    candidate = candidate.get("path") or candidate.get("name")
	            else:
	                candidate = item
	            image = _open_gallery_image(candidate)
	        except Exception:
	            continue
	        if image is not None:
	            images.append(image)

	    return images[:keep]


	def _open_gallery_image(candidate):
	    """Return a PIL image for a PIL image or an existing file path, else None."""
	    if isinstance(candidate, Image.Image):
	        return candidate
	    if isinstance(candidate, str) and os.path.exists(candidate):
	        # Close the file as soon as convert() has copied the pixel data.
	        with Image.open(candidate) as handle:
	            return handle.convert("RGB")
	    return None


	def run_safety_check_if_available(p, image: Image.Image, width: int, height: int) -> Image.Image:
	"""
	生成后安全检查。
	默认不会启用，因为 ENABLE_SAFETY_CHECKER=false。
	"""
	try:
	if getattr(p, "safety_feature_extractor", None) is None:
	return image
	if getattr(p, "safety_checker", None) is None:
	return image

	import numpy as np

	clip_inputs = p.safety_feature_extractor([image], return_tensors="pt")
	clip_input = clip_inputs.pixel_values.to(DEVICE)

	img_np = np.array(image).astype("float32") / 255.0
	img_np = img_np[None, ...]

	_checked_images, has_nsfw = p.safety_checker(
	images=img_np,
	clip_input=clip_input,
	)

	if isinstance(has_nsfw, (list, tuple)) and len(has_nsfw) > 0 and bool(has_nsfw[0]):
	return _load_nsfw_placeholder(width, height)

	return image

	except Exception:
	print("[Safety] Check failed, ignored:")
	traceback.print_exc()
	return image


	@spaces.GPU
	def generate(
	    prompt,
	    resolution="1024x1024 ( 1:1 )",
	    seed=42,
	    steps=8,
	    shift=3.0,
	    random_seed=True,
	    gallery_images=None,
	    enhance=False,
	    progress=gr.Progress(track_tqdm=True),
	):
	    """Gradio entry point for image generation.

	    In a ZeroGPU environment this function triggers dynamic GPU allocation.

	    Args:
	        prompt: Text prompt; must not be blank.
	        resolution: Resolution label from the dropdown.
	        seed: Seed used when ``random_seed`` is off (falls back to 42 when
	            it cannot be converted to int; -1 means "pick a random one").
	        steps: Requested step count, clamped to 1..100.
	        shift: Scheduler time shift.
	        random_seed: When truthy, a random seed is drawn.
	        gallery_images: Current gallery value, used as history.
	        enhance: When truthy, the prompt is passed through the enhancer.
	        progress: Gradio progress tracker.

	    Returns:
	        ``(gallery, seed_as_text, seed_as_int, status_text)``.

	    Raises:
	        gr.Error: For a blank prompt, a model loading failure or any
	            failure during generation.
	    """
	    try:
	        final_prompt = str(prompt or "").strip()
	        if not final_prompt:
	            raise gr.Error("Please enter a prompt.")

	        current_pipe = get_or_load_pipe()

	        if random_seed:
	            new_seed = random.randint(1, 1_000_000)
	        else:
	            try:
	                new_seed = int(seed)
	            except Exception:
	                new_seed = 42

	        if new_seed == -1:
	            new_seed = random.randint(1, 1_000_000)

	        pe_msg = ""
	        if enhance:
	            final_prompt, pe_msg = prompt_enhance(final_prompt, True)
	            print(f"[PE] Enhanced prompt: {final_prompt}")
	            print(f"[PE] Message: {pe_msg}")

	        # Keep only the "<width>x<height>" part in front of the first space.
	        try:
	            resolution_str = str(resolution).split(" ")[0]
	        except Exception:
	            resolution_str = "1024x1024"

	        width, height = get_resolution(resolution_str)

	        # Z-Image-Turbo usually needs only about 8 steps.
	        safe_steps = max(1, min(int(steps), 100))
	        # The pipeline is run with one more step than the clamped request.
	        pipeline_steps = safe_steps + 1

	        image = generate_image(
	            p=current_pipe,
	            prompt=final_prompt,
	            resolution=resolution_str,
	            seed=new_seed,
	            guidance_scale=0.0,
	            num_inference_steps=pipeline_steps,
	            shift=float(shift),
	        )

	        image = run_safety_check_if_available(current_pipe, image, width, height)

	        # Newest image first, then the history, capped to the gallery limit.
	        history = normalize_gallery_items(gallery_images)
	        gallery = ([image] + history)[:MAX_GALLERY_HISTORY]

	        status = (
	            f"Done. DEVICE={DEVICE}, resolution={resolution_str}, "
	            f"steps={pipeline_steps}, seed={new_seed}"
	        )
	        if pe_msg:
	            status += f"\nPrompt Enhance: {pe_msg[:300]}"

	        return gallery, str(new_seed), int(new_seed), status

	    except gr.Error:
	        # Already a user-facing error: pass it through untouched.
	        raise
	    except Exception as e:
	        print("[Generate] Failed:")
	        traceback.print_exc()
	        cuda_cleanup()
	        raise gr.Error(f"Generation failed: {type(e).__name__}: {e}")


	def update_res_choices(_res_cat):
	    """Return a dropdown update listing the resolutions of the chosen category.

	    An unknown category falls back to the "1024" list. The first entry of
	    the list becomes the selected value.
	    """
	    res_choices = RES_CHOICES.get(str(_res_cat), RES_CHOICES["1024"])
	    return gr.update(value=res_choices[0], choices=res_choices)


	def get_model_status():
	    """Describe the current model state (handler of the "Model Status" button).

	    Returns:
	        A text saying that the model is loaded, or the tail (last 4000
	        characters) of the last loading error, or a note that the model has
	        not been loaded yet.
	    """
	    if pipe is None:
	        if not MODEL_LOAD_ERROR:
	            return (
	                "Model not loaded yet. This is normal. "
	                "The model will be loaded when you click Generate."
	            )
	        return "Model not loaded. Last loading error:\n" + MODEL_LOAD_ERROR[-4000:]

	    return f"Model loaded. DEVICE={DEVICE}, DTYPE={DTYPE}, MODEL_PATH={MODEL_PATH}"


	# Custom CSS that caps the width of the `.fillable` and `.gradio-container`
	# elements.
	# NOTE(review): `css` is not referenced anywhere else in the visible code
	# (it is not passed to gr.Blocks or demo.launch) — confirm whether it is
	# meant to be applied.
	css = """
	.fillable {
	max-width: 1230px !important;
	}
	.gradio-container {
	max-width: 1280px !important;
	}
	"""


	# ==================== Gradio UI ====================
	# Builds the Blocks application `demo`: input controls, the output gallery,
	# and the event wiring for the two buttons and the category dropdown.
	# NOTE(review): the original indentation of this section is not visible
	# here, so the exact nesting of the Row / Column contexts below should be
	# confirmed against the real file.
	with gr.Blocks(title="Z-Image Demo") as demo:
	# Page heading rendered as centered Markdown/HTML.
	gr.Markdown(
	"""<div align="center">

	# Z-Image Generation Demo

	ZeroGPU friendly lazy-loading version

	</div>"""
	)

	with gr.Row():
	with gr.Column(scale=1):
	# Prompt text entry.
	prompt_input = gr.Textbox(
	label="Prompt",
	lines=4,
	placeholder="Enter your prompt here...",
	)

	with gr.Row():
	# Category choices are the RES_CHOICES keys converted to int;
	# update_res_choices converts the selection back with str().
	choices = [int(k) for k in RES_CHOICES.keys()]
	res_cat = gr.Dropdown(
	value=1024,
	choices=choices,
	label="Resolution Category",
	)

	# Starts with the "1024" list; repopulated by res_cat.change below.
	initial_res_choices = RES_CHOICES["1024"]
	resolution = gr.Dropdown(
	value=initial_res_choices[0],
	choices=initial_res_choices,
	label="Width x Height (Ratio)",
	)

	with gr.Row():
	# Seed controls; `seed` is also an output of generate, so it is
	# overwritten with the seed that was actually used.
	seed = gr.Number(
	label="Seed",
	value=42,
	precision=0,
	)
	random_seed = gr.Checkbox(
	label="Random Seed",
	value=True,
	)

	with gr.Row():
	# Sampling controls passed to generate as `steps` and `shift`.
	steps = gr.Slider(
	label="Steps",
	minimum=1,
	maximum=100,
	value=8,
	step=1,
	interactive=True,
	)
	shift = gr.Slider(
	label="Time Shift",
	minimum=1.0,
	maximum=10.0,
	value=3.0,
	step=0.1,
	interactive=True,
	)

	# Opt-in prompt enhancement, off by default.
	enhance = gr.Checkbox(
	label="Enhance Prompt with DashScope",
	value=False,
	info="Requires DASHSCOPE_API_KEY and openai package. Keep disabled if not needed.",
	)

	with gr.Row():
	generate_btn = gr.Button("Generate", variant="primary")
	status_btn = gr.Button("Model Status")

	# Read-only box receiving the status text of both button handlers.
	status_box = gr.Textbox(
	label="Status / Logs",
	lines=6,
	interactive=False,
	)

	gr.Markdown("### 📝 Example Prompts")
	# Clicking an example fills the prompt textbox.
	gr.Examples(
	examples=EXAMPLE_PROMPTS,
	inputs=prompt_input,
	label=None,
	)

	with gr.Column(scale=1):
	# The gallery is both an input (history) and an output of generate.
	output_gallery = gr.Gallery(
	label="Generated Images",
	columns=2,
	rows=2,
	height=600,
	object_fit="contain",
	format="png",
	interactive=False,
	)
	used_seed = gr.Textbox(
	label="Seed Used",
	interactive=False,
	)

	# Changing the category repopulates the resolution dropdown.
	res_cat.change(
	update_res_choices,
	inputs=res_cat,
	outputs=resolution,
	)

	# The input order matches the positional parameters of generate; the four
	# outputs match its returned (gallery, seed text, seed number, status).
	generate_btn.click(
	generate,
	inputs=[
	prompt_input,
	resolution,
	seed,
	steps,
	shift,
	random_seed,
	output_gallery,
	enhance,
	],
	outputs=[
	output_gallery,
	used_seed,
	seed,
	status_box,
	],
	)

	# Shows the text returned by get_model_status in the status box.
	status_btn.click(
	get_model_status,
	inputs=[],
	outputs=[status_box],
	)


	if __name__ == "__main__":
	    # Compatibility across Gradio versions: newer versions accept
	    # mcp_server; when an older version rejects the keyword with a
	    # TypeError, launch again without it.
	    launch_options = {"mcp_server": True}
	    try:
	        demo.launch(**launch_options)
	    except TypeError:
	        demo.launch()