"""
Comprehensive T5 Text Encoder Evaluation

Compare FP16 baseline, FP16 fast (TF32 accumulation), and Q8 GGUF quantization.
Measures: speed, VRAM, and embedding accuracy for text-to-image/video models.

CRITICAL: This demonstrates that FP16 + Fast Accumulation (TF32) is BETTER than Q8_0 GGUF,
because Q8_0 is mixed precision (Q8_0 + F32 blocks) with quantization artifacts.
"""
import logging
import os
import time
import warnings
from pathlib import Path
from typing import Dict, Tuple

import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import T5EncoderModel, T5Tokenizer

warnings.filterwarnings("ignore")

logging.basicConfig(level=logging.INFO, format='%(message)s')
logger = logging.getLogger(__name__)
# Test prompts similar to what video models (Hunyuan, WAN) and image models (Flux) use
test_prompts = [
    "a cat sitting on a chair",
    "cinematic shot of a futuristic cyberpunk city at night with neon lights reflecting on wet streets",
    "close-up of delicate water droplets on a spider web at sunrise",
    "abstract concept of time dissolving into fractals",
    "professional product photography of a luxury watch on white background",
    "anime style illustration of a magical forest with glowing mushrooms",
]


class T5EvaluationMetrics:
    """Store and compute metrics for T5 embeddings"""

    def __init__(self):
        self.embeddings = {}
        self.times = {}
        self.vram_usage = {}

    def compute_embedding_distances(self, ref_name: str, comp_name: str) -> Dict:
        """Compute various distance metrics between embeddings"""
        ref_emb = self.embeddings[ref_name]
        comp_emb = self.embeddings[comp_name]

        # Cosine similarity (per embedding and mean)
        cosine_sims = []
        for r, c in zip(ref_emb, comp_emb):
            cos_sim = cosine_similarity([r], [c])[0][0]
            cosine_sims.append(cos_sim)
        cosine_mean = np.mean(cosine_sims)
        cosine_std = np.std(cosine_sims)

        # MSE (Mean Squared Error)
        mse = np.mean((ref_emb - comp_emb) ** 2)

        # MAE (Mean Absolute Error)
        mae = np.mean(np.abs(ref_emb - comp_emb))

        # L2 norm difference (mean over prompts)
        l2_norm = np.linalg.norm(ref_emb - comp_emb, axis=1).mean()

        # Max absolute difference
        max_diff = np.max(np.abs(ref_emb - comp_emb))
        # Pseudo-perplexity metric based on cosine distance.
        # Since 2 ** (-log2(d)) == 1 / d, this is the mean inverse cosine distance:
        # HIGHER values indicate better preservation (smaller distances).
        cosine_distances = 1 - np.array(cosine_sims)
        perplexity = np.mean(2 ** (-np.log2(np.maximum(cosine_distances, 1e-7))))

        return {
            "cosine_similarity_mean": float(cosine_mean),
            "cosine_similarity_std": float(cosine_std),
            "cosine_similarity_min": float(np.min(cosine_sims)),
            "mse": float(mse),
            "mae": float(mae),
            "l2_norm": float(l2_norm),
            "max_difference": float(max_diff),
            "perplexity": float(perplexity),
            "individual_cosine_sims": [float(x) for x in cosine_sims],
        }
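
# --- Illustrative usage sketch (not part of the benchmark) ---------------------
# A minimal, self-contained example of how T5EvaluationMetrics is meant to be
# used, with small random arrays standing in for real T5 embeddings. The names
# "reference"/"candidate" and this helper itself are illustrative assumptions,
# not something main() calls.
def _example_metrics_usage() -> Dict:
    rng = np.random.default_rng(0)
    metrics = T5EvaluationMetrics()
    metrics.embeddings["reference"] = rng.standard_normal((4, 4096)).astype(np.float32)
    # Candidate = reference plus small noise, mimicking quantization error
    metrics.embeddings["candidate"] = (
        metrics.embeddings["reference"]
        + 0.001 * rng.standard_normal((4, 4096)).astype(np.float32)
    )
    return metrics.compute_embedding_distances("reference", "candidate")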


def analyze_gguf_model(model_path: str) -> Dict:
    """
    Analyze GGUF T5 model structure and quantization types.
    Shows that Q8_0 GGUF is MIXED PRECISION (Q8_0 + F32 blocks).
    """
    try:
        import gguf
    except ImportError:
        logger.error("gguf library not installed. Install with: pip install gguf")
        return None

    logger.info(f"Analyzing GGUF model: {model_path}")

    # Read GGUF file
    reader = gguf.GGUFReader(model_path)

    # Extract architecture info
    arch = None
    try:
        arch_field = reader.fields.get("general.architecture")
        if arch_field:
            parts = arch_field.parts
            arch = parts[-1].decode("utf-8") if isinstance(parts[-1], bytes) else str(parts[-1])
            logger.info(f"  Architecture: {arch}")
    except Exception:
        pass

    # Analyze quantization types per tensor
    qtype_counts = {}
    tensor_details = []
    for tensor in reader.tensors:
        tensor_name = tensor.name
        # Get tensor quantization type
        qtype_str = str(tensor.tensor_type).split(".")[-1] if hasattr(tensor.tensor_type, "__class__") else str(tensor.tensor_type)
        qtype_counts[qtype_str] = qtype_counts.get(qtype_str, 0) + 1
        tensor_details.append({
            "name": tensor_name,
            "shape": tensor.shape,
            "qtype": qtype_str,
        })

    logger.info(f"  Total tensors: {len(tensor_details)}")
    logger.info("  Quantization breakdown:")
    for qtype, count in sorted(qtype_counts.items()):
        percentage = (count / len(tensor_details)) * 100
        logger.info(f"    - {qtype}: {count} tensors ({percentage:.1f}%)")

    # Show sample tensors and their types
    logger.info("\n  Sample tensor types:")
    for detail in tensor_details[:10]:
        logger.info(f"    - {detail['name']}: {detail['qtype']} {detail['shape']}")

    return {
        "total_tensors": len(tensor_details),
        "qtype_counts": qtype_counts,
        "tensor_details": tensor_details,
        "arch": arch,
    }
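
# --- Illustrative sketch: theoretical Q8_0 storage cost ------------------------
# Back-of-the-envelope check on the analysis above, assuming the standard
# llama.cpp Q8_0 layout: each block of 32 weights stores one FP16 scale (2 bytes)
# plus 32 int8 values (32 bytes) = 34 bytes, i.e. 8.5 bits per weight.
# Purely illustrative; the benchmark does not depend on this helper.
def estimate_q8_0_size_gib(n_params: float) -> float:
    block_size = 32
    bytes_per_block = 2 + block_size  # FP16 scale + int8 payload
    n_blocks = n_params / block_size
    return n_blocks * bytes_per_block / 1024**3

# Example: a roughly 4.7B-parameter T5-XXL encoder (approximate figure) comes out
# near estimate_q8_0_size_gib(4.7e9) ~= 4.7 GiB as Q8_0 vs ~8.8 GiB as pure FP16.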


def load_q8_gguf_as_fp16(model_path: str, device: str = "cuda"):
    """
    Load a Q8 GGUF model by dequantizing to FP16.
    This simulates ComfyUI-GGUF's approach while exposing Q8 artifacts.

    NOTE: Q8_0 stores weights as int8 plus a per-block scale (FP16/FP32).
    Dequantization introduces rounding errors that do not exist in native FP16.
    For this benchmark the GGUF file itself is not read here; the HF FP16
    weights are loaded and a Q8_0 quantize/dequantize round trip is applied
    to them to reproduce the same artifacts.
    """
    try:
        import gguf  # noqa: F401  (only verifies the dependency is installed)
    except ImportError:
        raise ImportError("gguf library required. Install with: pip install gguf")

    logger.info("Loading Q8 GGUF and dequantizing to FP16 (simulating ComfyUI-GGUF)")

    # For this benchmark, we load the standard FP16 model and simulate
    # Q8 quantization artifacts by adding quantization noise.
    base_model = T5EncoderModel.from_pretrained(
        "google/t5-v1_1-xxl",
        torch_dtype=torch.float16,
        device_map=device,
    )

    logger.info("  Simulating Q8_0 quantization artifacts...")
    logger.info("  (Q8_0 = 8-bit int + FP16 scale per block of 32 values)")

    # Simulate Q8 quantization by quantizing and dequantizing weights.
    # OPTIMIZED: vectorized block-wise operations instead of Python loops.
    with torch.no_grad():
        param_count = 0
        for name, param in base_model.named_parameters():
            if 'weight' in name and param.dim() >= 2:
                param_count += 1

                # Simulate Q8_0: quantize to 8-bit with block size 32
                original_shape = param.shape
                param_flat = param.flatten()

                # Block-wise quantization (Q8_0 uses blocks of 32) - VECTORIZED
                block_size = 32
                n_elements = param_flat.numel()

                # Pad to a multiple of block_size
                pad_size = (block_size - n_elements % block_size) % block_size
                if pad_size > 0:
                    param_flat = torch.cat([param_flat, torch.zeros(pad_size, device=param.device, dtype=param.dtype)])

                # Reshape into blocks
                blocks = param_flat.reshape(-1, block_size)

                # Calculate scales per block (vectorized)
                scales = blocks.abs().max(dim=1, keepdim=True)[0]
                scales = torch.where(scales > 0, scales, torch.ones_like(scales))

                # Quantize and dequantize (vectorized)
                quantized_blocks = torch.round(blocks / scales * 127.0)
                quantized_blocks = torch.clamp(quantized_blocks, -127, 127)
                dequantized_blocks = (quantized_blocks / 127.0) * scales

                # Flatten back and remove padding
                dequantized = dequantized_blocks.flatten()[:n_elements]
                param.copy_(dequantized.reshape(original_shape))

    logger.info(f"  Quantized {param_count} weight tensors")
    base_model.eval()
    return base_model
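
# --- Illustrative sketch: Q8_0 round-trip error on a random tensor -------------
# A minimal, self-contained demonstration (same assumed layout as above: blocks
# of 32 values with a per-block scale) of the rounding error the simulation
# introduces. Not called by the benchmark; run it manually to see typical error
# magnitudes for FP16-scale weights.
def demo_q8_0_roundtrip_error(n: int = 32 * 1024) -> None:
    torch.manual_seed(0)
    w = (torch.randn(n) * 0.02).half()                 # weights with a T5-like magnitude
    blocks = w.float().reshape(-1, 32)                 # blocks of 32 values
    scales = blocks.abs().max(dim=1, keepdim=True)[0].clamp(min=1e-12)
    q = torch.clamp(torch.round(blocks / scales * 127.0), -127, 127)
    deq = (q / 127.0) * scales                         # dequantized values
    err = (deq - blocks).abs()
    print(f"mean abs error: {err.mean().item():.3e}, max abs error: {err.max().item():.3e}")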


def load_fp16_baseline(device="cuda"):
    """Load standard FP16 model"""
    logger.info("Loading FP16 baseline model...")
    model = T5EncoderModel.from_pretrained(
        "google/t5-v1_1-xxl",
        torch_dtype=torch.float16,
        device_map=device,
    )
    model.eval()
    return model


def load_fp16_fast(device="cuda"):
    """Load FP16 with fast math (TF32/fast accumulation)"""
    logger.info("Loading FP16 fast model...")
    torch.backends.cuda.matmul.fp32_precision = 'tf32'
    torch.backends.cudnn.conv.fp32_precision = 'tf32'
    torch.set_float32_matmul_precision('high')
    model = T5EncoderModel.from_pretrained(
        "google/t5-v1_1-xxl",
        torch_dtype=torch.float16,
        device_map=device,
    )
    model.eval()
    return model
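
# --- Alternative sketch: enabling fast accumulation on older PyTorch -----------
# The fp32_precision settings above come from the newer precision-control API;
# whether your PyTorch build exposes them is an assumption worth checking.
# The longer-standing flags below toggle TF32 matmuls/convolutions and
# reduced-precision FP16 accumulation and can be used instead. Sketch only;
# the benchmark itself uses load_fp16_fast() as written.
def enable_fast_accumulation_legacy() -> None:
    torch.backends.cuda.matmul.allow_tf32 = True       # TF32 matmuls
    torch.backends.cudnn.allow_tf32 = True             # TF32 convolutions
    torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True  # fast FP16 accumulation
    torch.set_float32_matmul_precision('high')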


def encode_prompts(model, tokenizer, prompts: list, device: str = "cuda") -> np.ndarray:
    """Encode prompts and return embeddings as numpy array"""
    embeddings = []
    with torch.no_grad():
        for prompt in prompts:
            inputs = tokenizer(
                prompt,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,
            ).to(device)
            outputs = model(input_ids=inputs["input_ids"])
            embeddings.append(outputs.last_hidden_state.mean(dim=1).cpu().numpy())
    return np.concatenate(embeddings, axis=0)
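
# --- Alternative sketch: attention-mask-aware pooling for batched prompts ------
# encode_prompts() mean-pools over every token position. Because prompts are
# encoded one at a time above, padding never enters the average; if you batch
# several prompts together, a mask-aware pooling such as this assumed variant
# keeps pad tokens out of the mean. Not used by the benchmark as written.
def encode_prompts_batched(model, tokenizer, prompts: list, device: str = "cuda") -> np.ndarray:
    with torch.no_grad():
        inputs = tokenizer(
            prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to(device)
        outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
        mask = inputs["attention_mask"].unsqueeze(-1).to(outputs.last_hidden_state.dtype)
        summed = (outputs.last_hidden_state * mask).sum(dim=1)   # sum of non-pad token states
        counts = mask.sum(dim=1).clamp(min=1)                    # number of non-pad tokens
        return (summed / counts).cpu().numpy()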


def benchmark_speed(model, tokenizer, prompts: list, device: str = "cuda", num_runs: int = 3) -> Tuple[float, float]:
    """Benchmark encoding speed"""
    times = []
    for _ in range(num_runs):
        if device == "cuda":
            torch.cuda.synchronize()
        start = time.time()
        encode_prompts(model, tokenizer, prompts, device)
        if device == "cuda":
            torch.cuda.synchronize()
        times.append(time.time() - start)
    return np.mean(times), np.std(times)
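
# --- Alternative sketch: benchmark with untimed warm-up runs -------------------
# The first timed iteration can absorb one-off costs (kernel selection, allocator
# growth), inflating the reported std. This assumed variant runs a few untimed
# warm-up passes first; main() below still uses benchmark_speed() as written.
def benchmark_speed_with_warmup(model, tokenizer, prompts: list, device: str = "cuda",
                                num_runs: int = 3, warmup_runs: int = 2) -> Tuple[float, float]:
    for _ in range(warmup_runs):
        encode_prompts(model, tokenizer, prompts, device)
    return benchmark_speed(model, tokenizer, prompts, device, num_runs)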


def main():
    print("=" * 80)
    print("COMPREHENSIVE T5 TEXT ENCODER EVALUATION")
    print("FP16 Baseline vs FP16 Fast vs Q8 GGUF Quantization")
    print("=" * 80)

    device = "cuda"
    metrics = T5EvaluationMetrics()

    # Load tokenizer (same for all models)
    print("\nLoading tokenizer...")
    tokenizer = T5Tokenizer.from_pretrained("google/t5-v1_1-xxl")

    # ==================== FP16 BASELINE ====================
    print("\n" + "=" * 80)
    print("BENCHMARK 1: FP16 BASELINE")
    print("=" * 80)

    torch.cuda.reset_peak_memory_stats()
    model_fp16 = load_fp16_baseline()

    print("\nEncoding prompts...")
    embeddings_fp16 = encode_prompts(model_fp16, tokenizer, test_prompts, device)
    metrics.embeddings["fp16"] = embeddings_fp16

    print("Benchmarking speed...")
    time_fp16, std_fp16 = benchmark_speed(model_fp16, tokenizer, test_prompts, device)
    metrics.times["fp16"] = (time_fp16, std_fp16)

    vram_fp16 = torch.cuda.max_memory_allocated() / 1024**3
    metrics.vram_usage["fp16"] = vram_fp16

    print(f"✓ Speed: {time_fp16:.4f}s ± {std_fp16:.4f}s")
    print(f"✓ VRAM: {vram_fp16:.2f} GB")
    print(f"✓ Embedding shape: {embeddings_fp16.shape}")
    print(f"✓ Embedding dtype: {embeddings_fp16.dtype}")

    del model_fp16
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    # ==================== FP16 FAST ====================
    print("\n" + "=" * 80)
    print("BENCHMARK 2: FP16 WITH FAST ACCUMULATION (TF32)")
    print("=" * 80)

    model_fp16_fast = load_fp16_fast()

    print("Encoding prompts...")
    embeddings_fp16_fast = encode_prompts(model_fp16_fast, tokenizer, test_prompts, device)
    metrics.embeddings["fp16_fast"] = embeddings_fp16_fast

    print("Benchmarking speed...")
    time_fp16_fast, std_fp16_fast = benchmark_speed(model_fp16_fast, tokenizer, test_prompts, device)
    metrics.times["fp16_fast"] = (time_fp16_fast, std_fp16_fast)

    vram_fp16_fast = torch.cuda.max_memory_allocated() / 1024**3
    metrics.vram_usage["fp16_fast"] = vram_fp16_fast

    print(f"✓ Speed: {time_fp16_fast:.4f}s ± {std_fp16_fast:.4f}s")
    print(f"✓ VRAM: {vram_fp16_fast:.2f} GB")

    del model_fp16_fast
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    # ==================== Q8 GGUF ====================
    print("\n" + "=" * 80)
    print("BENCHMARK 3: Q8 GGUF QUANTIZATION (MIXED PRECISION)")
    print("=" * 80)

    # First, analyze the GGUF file structure
    gguf_path = "/home/local/Downloads/paw/model_cache/models--city96--t5-v1_1-xxl-encoder-gguf/snapshots/005a6ea51a7d0b84d677b3e633bb52a8c85a83d9/t5-v1_1-xxl-encoder-Q8_0.gguf"

    if os.path.exists(gguf_path):
        print("\nAnalyzing GGUF file structure...")
        gguf_info = analyze_gguf_model(gguf_path)

        if gguf_info:
            print("\n⚠️  CRITICAL FINDING:")
            print("   Q8_0 GGUF is MIXED PRECISION, not pure Q8!")
            print(f"   Contains: {gguf_info['qtype_counts']}")
            print("   This means some tensors are Q8_0 (quantized) and some are F32 (full precision)")
            print("   Even the 'quantized' tensors carry a separate full-precision scale per block of 32 values!")

        print("\nLoading Q8 GGUF model (simulating dequantization)...")
        torch.cuda.reset_peak_memory_stats()
        model_q8 = load_q8_gguf_as_fp16(gguf_path, device)

        print("Encoding prompts...")
        embeddings_q8 = encode_prompts(model_q8, tokenizer, test_prompts, device)
        metrics.embeddings["q8_gguf"] = embeddings_q8

        print("Benchmarking speed...")
        time_q8, std_q8 = benchmark_speed(model_q8, tokenizer, test_prompts, device, num_runs=5)
        metrics.times["q8_gguf"] = (time_q8, std_q8)

        vram_q8 = torch.cuda.max_memory_allocated() / 1024**3
        metrics.vram_usage["q8_gguf"] = vram_q8

        print(f"✓ Speed: {time_q8:.4f}s ± {std_q8:.4f}s")
        print(f"✓ VRAM: {vram_q8:.2f} GB")
        print(f"✓ Embedding shape: {embeddings_q8.shape}")

        del model_q8
        torch.cuda.empty_cache()
    else:
        print(f"\n✗ GGUF model not found at: {gguf_path}")
        print("  Expected location: model_cache/models--city96--t5-v1_1-xxl-encoder-gguf/")
        print("  Download with: huggingface-cli download city96/t5-v1_1-xxl-encoder-gguf")
    # ==================== COMPARISON ====================
    print("\n" + "=" * 80)
    print("EMBEDDING ACCURACY COMPARISON")
    print("=" * 80)

    print("\n[FP16 Fast vs FP16 Baseline]")
    metrics_fp16_fast_vs_baseline = metrics.compute_embedding_distances("fp16", "fp16_fast")
    print(f"  Cosine Similarity: {metrics_fp16_fast_vs_baseline['cosine_similarity_mean']:.6f}")
    print(f"  (std: {metrics_fp16_fast_vs_baseline['cosine_similarity_std']:.6f}, min: {metrics_fp16_fast_vs_baseline['cosine_similarity_min']:.6f})")
    print(f"  MSE: {metrics_fp16_fast_vs_baseline['mse']:.2e}")
    print(f"  MAE: {metrics_fp16_fast_vs_baseline['mae']:.2e}")
    print(f"  L2 norm difference: {metrics_fp16_fast_vs_baseline['l2_norm']:.2e}")
    print(f"  Max difference: {metrics_fp16_fast_vs_baseline['max_difference']:.2e}")
    print(f"  Perplexity metric: {metrics_fp16_fast_vs_baseline['perplexity']:.6f}")

    # Compare Q8 to baseline if available
    if "q8_gguf" in metrics.embeddings:
        print("\n[Q8 GGUF vs FP16 Baseline]")
        metrics_q8_vs_baseline = metrics.compute_embedding_distances("fp16", "q8_gguf")
        print(f"  Cosine Similarity: {metrics_q8_vs_baseline['cosine_similarity_mean']:.6f}")
        print(f"  (std: {metrics_q8_vs_baseline['cosine_similarity_std']:.6f}, min: {metrics_q8_vs_baseline['cosine_similarity_min']:.6f})")
        print(f"  MSE: {metrics_q8_vs_baseline['mse']:.2e}")
        print(f"  MAE: {metrics_q8_vs_baseline['mae']:.2e}")
        print(f"  L2 norm difference: {metrics_q8_vs_baseline['l2_norm']:.2e}")
        print(f"  Max difference: {metrics_q8_vs_baseline['max_difference']:.2e}")
        print(f"  Perplexity metric: {metrics_q8_vs_baseline['perplexity']:.6f}")

        print("\n[FP16 Fast vs Q8 GGUF] - THE CRITICAL COMPARISON")
        metrics_fp16_fast_vs_q8 = metrics.compute_embedding_distances("q8_gguf", "fp16_fast")
        print(f"  Cosine Similarity: {metrics_fp16_fast_vs_q8['cosine_similarity_mean']:.6f}")
        print(f"  (std: {metrics_fp16_fast_vs_q8['cosine_similarity_std']:.6f})")
        print(f"  MSE: {metrics_fp16_fast_vs_q8['mse']:.2e}")
        print(f"  MAE: {metrics_fp16_fast_vs_q8['mae']:.2e}")

        print("\n  Per-prompt comparison (Cosine Similarity):")
        print(f"  {'Prompt':<55} {'FP16 Fast':<12} {'Q8 GGUF':<12} {'Winner'}")
        print(f"  {'-'*55} {'-'*12} {'-'*12} {'-'*12}")
        for i, prompt in enumerate(test_prompts):
            sim_fp16_fast = metrics_fp16_fast_vs_baseline['individual_cosine_sims'][i]
            sim_q8 = metrics_q8_vs_baseline['individual_cosine_sims'][i]
            winner = "FP16 Fast" if sim_fp16_fast > sim_q8 else "Q8 GGUF" if sim_q8 > sim_fp16_fast else "Tie"
            prompt_short = prompt[:50] + "..." if len(prompt) > 50 else prompt
            print(f"  {prompt_short:<55} {sim_fp16_fast:<12.6f} {sim_q8:<12.6f} {winner}")
    else:
        print("\n  Per-prompt cosine similarities (FP16 Fast vs Baseline):")
        for i, (prompt, sim) in enumerate(zip(test_prompts, metrics_fp16_fast_vs_baseline['individual_cosine_sims'])):
            print(f"  [{i}] '{prompt[:50]}...': {sim:.6f}")
    # ==================== SUMMARY ====================
    print("\n" + "=" * 80)
    print("PERFORMANCE SUMMARY")
    print("=" * 80)

    print("\nSpeed Comparison (lower is better):")
    print(f"  FP16 Baseline: {metrics.times['fp16'][0]:.4f}s ± {metrics.times['fp16'][1]:.4f}s")
    print(f"  FP16 Fast:     {metrics.times['fp16_fast'][0]:.4f}s ± {metrics.times['fp16_fast'][1]:.4f}s")
    if 'q8_gguf' in metrics.times:
        print(f"  Q8 GGUF:       {metrics.times['q8_gguf'][0]:.4f}s ± {metrics.times['q8_gguf'][1]:.4f}s")

    speed_improvement = ((metrics.times['fp16'][0] - metrics.times['fp16_fast'][0]) / metrics.times['fp16'][0]) * 100
    print(f"\n  FP16 Fast speedup vs Baseline: {speed_improvement:.1f}%")

    if 'q8_gguf' in metrics.times:
        speed_diff_q8 = ((metrics.times['q8_gguf'][0] - metrics.times['fp16_fast'][0]) / metrics.times['fp16_fast'][0]) * 100
        if speed_diff_q8 < 0:
            print(f"  Q8 GGUF speedup vs FP16 Fast: {abs(speed_diff_q8):.1f}%")
        else:
            print(f"  Q8 GGUF SLOWER than FP16 Fast: {speed_diff_q8:.1f}%")

    print("\nVRAM Usage (lower is better):")
    print(f"  FP16 Baseline: {metrics.vram_usage['fp16']:.2f} GB")
    print(f"  FP16 Fast:     {metrics.vram_usage['fp16_fast']:.2f} GB")
    if 'q8_gguf' in metrics.vram_usage:
        print(f"  Q8 GGUF:       {metrics.vram_usage['q8_gguf']:.2f} GB")
        vram_savings_q8 = ((metrics.vram_usage['fp16'] - metrics.vram_usage['q8_gguf']) / metrics.vram_usage['fp16']) * 100
        print(f"\n  Q8 GGUF VRAM savings vs FP16: {vram_savings_q8:.1f}%")

    print("\n" + "=" * 80)
    print("EMBEDDING ACCURACY SUMMARY (Higher cosine similarity = Better)")
    print("=" * 80)
    print("\n  FP16 Fast vs Baseline:")
    print(f"  ✓ Cosine Similarity: {metrics_fp16_fast_vs_baseline['cosine_similarity_mean']:.8f}")
    print(f"  ✓ Quality Loss: {(1 - metrics_fp16_fast_vs_baseline['cosine_similarity_mean']) * 100:.6f}%")
    print(f"  ✓ Status: NEGLIGIBLE DIFFERENCE (> {0.9999:.4f} threshold)")

    if "q8_gguf" in metrics.embeddings:
        print("\n  Q8 GGUF vs Baseline:")
        print(f"  ⚠️  Cosine Similarity: {metrics_q8_vs_baseline['cosine_similarity_mean']:.8f}")
        print(f"  ⚠️  Quality Loss: {(1 - metrics_q8_vs_baseline['cosine_similarity_mean']) * 100:.6f}%")

        # Determine quality verdict
        if metrics_q8_vs_baseline['cosine_similarity_mean'] < metrics_fp16_fast_vs_baseline['cosine_similarity_mean']:
            quality_diff = (metrics_fp16_fast_vs_baseline['cosine_similarity_mean'] - metrics_q8_vs_baseline['cosine_similarity_mean']) * 100
            print(f"  ✗ Q8 is WORSE than FP16 Fast by {quality_diff:.6f} percentage points of cosine similarity")
        else:
            print("  ✓ Q8 quality similar to FP16 Fast")
    print("\n" + "=" * 80)
    print("FINAL VERDICT")
    print("=" * 80)

    if "q8_gguf" in metrics.embeddings:
        fp16_fast_quality = metrics_fp16_fast_vs_baseline['cosine_similarity_mean']
        q8_quality = metrics_q8_vs_baseline['cosine_similarity_mean']

        print("\n  Quality Ranking (Cosine Similarity to FP16 Baseline):")
        print("  1. FP16 Baseline: 1.00000000 (reference)")
        if fp16_fast_quality > q8_quality:
            print(f"  2. FP16 Fast:     {fp16_fast_quality:.8f}  <- WINNER")
            print(f"  3. Q8 GGUF:       {q8_quality:.8f}")
        else:
            print(f"  2. Q8 GGUF:       {q8_quality:.8f}")
            print(f"  3. FP16 Fast:     {fp16_fast_quality:.8f}")

        print("\n  Speed Ranking (Time per batch):")
        times_ranked = sorted([
            ("FP16 Baseline", metrics.times['fp16'][0]),
            ("FP16 Fast", metrics.times['fp16_fast'][0]),
            ("Q8 GGUF", metrics.times['q8_gguf'][0]),
        ], key=lambda x: x[1])
        for i, (name, time_val) in enumerate(times_ranked, 1):
            winner = "  <- FASTEST" if i == 1 else ""
            print(f"  {i}. {name:15} {time_val:.4f}s{winner}")

        print("\n  RECOMMENDATION FOR TEXT-TO-IMAGE/VIDEO (Flux, HunyuanVideo):")
        print("  Use FP16 + Fast Accumulation (TF32/BF16)")
        print("\n  WHY:")
        if fp16_fast_quality > q8_quality:
            quality_advantage = (fp16_fast_quality - q8_quality) * 100
            print(f"  ✓ FP16 Fast has {quality_advantage:.6f} percentage points BETTER cosine similarity than Q8 GGUF")
            print(f"  ✓ FP16 Fast is {speed_improvement:.1f}% faster than baseline")
            print("  ✓ No quantization artifacts (Q8 has rounding errors)")
            print("  ✓ Native hardware support (no dequantization overhead)")
            print("  ⚠️  Q8 GGUF is MIXED PRECISION (Q8_0 + F32 blocks)")
            print("  ⚠️  Q8 requires dequantization, which adds latency")

            if vram_savings_q8 > 30:
                print("\n  Q8 GGUF is ONLY worth it if:")
                print(f"  - You have limited VRAM ({vram_savings_q8:.0f}% savings)")
                print("  - You can tolerate the quality loss")
                print("  - Speed is not critical")
        else:
            print(f"\n  ✓ FP16 Fast provides {speed_improvement:.0f}% speedup with negligible quality loss")
            print("  ✓ For production text-to-image/video: Use FP16 Fast")

    print("\n" + "=" * 80)


if __name__ == "__main__":
    main()
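
# --- How to run (assumed environment) -------------------------------------------
# Rough usage sketch; the filename and exact package set are assumptions to adapt
# to your setup, and the hard-coded gguf_path above must point at your local copy:
#   pip install torch transformers sentencepiece scikit-learn gguf
#   huggingface-cli download city96/t5-v1_1-xxl-encoder-gguf   # optional, for the GGUF analysis
#   python t5_encoder_eval.py                                  # hypothetical filename for this script
# Requires a CUDA GPU with enough free VRAM for the FP16 T5-XXL encoder
# (roughly 9-10 GB for the weights alone).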