#!/bin/bash
# Paths to the ComfyUI checkout and its Python virtual environment
COMFYUI_DIR="/mnt/storage/ComfyUI"
VENV_DIR="/mnt/storage/Comfy_Venv/.venv"

# Activate the venv, then run from the ComfyUI directory
source "$VENV_DIR/bin/activate"
cd "$COMFYUI_DIR"
# -----------------------------
# GPU visibility / architecture
# -----------------------------
export HSA_FORCE_FINE_GRAIN_PCIE=1
export HIP_VISIBLE_DEVICES=0
export ROCR_VISIBLE_DEVICES=0
export HIP_TARGET="gfx1201"
export PYTORCH_ROCM_ARCH="gfx1201"
export TORCH_HIP_ARCH_LIST="gfx1201"
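# Optional sanity check: confirm ROCm actually sees the target GPU (gfx1201 = RDNA4).
# rocminfo | grep -i "gfx"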
# -----------------------------
# Mesa / RADV / debugging
# -----------------------------
export MESA_LOADER_DRIVER_OVERRIDE=amdgpu
export RADV_PERFTEST=aco,nggc,sam
export AMD_DEBUG=0
export ROCBLAS_VERBOSE_HIPBLASLT_ERROR=1
export AMD_SERIALIZE_KERNEL=0
export PYTORCH_HIP_FREE_MEMORY_THRESHOLD_MB=128
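# Note: MESA_LOADER_DRIVER_OVERRIDE and RADV_PERFTEST tune the Vulkan/Mesa stack
# rather than ROCm compute; they are harmless here but only matter if something
# in the pipeline renders through Vulkan. AMD_SERIALIZE_KERNEL=0 keeps HIP kernel
# launches asynchronous; raising it serializes launches, which helps localize
# kernel faults when debugging.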
# -----------------------------
# Memory / performance tuning
# -----------------------------
export PYTORCH_ALLOC_CONF="garbage_collection_threshold:0.6,max_split_size_mb:6144"
export OMP_NUM_THREADS=12
export MKL_NUM_THREADS=12
export NUMEXPR_NUM_THREADS=12
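# The thread counts above assume a 12-core CPU; to size them to the machine:
# export OMP_NUM_THREADS="$(nproc)"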
# Precision and performance
export TORCH_BLAS_PREFER_HIPBLASLT=0
export TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_BACKENDS="CK,TRITON,ROCBLAS"
export TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_SEARCH_SPACE="BEST"
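# Autotuning GEMMs over Composable Kernel, Triton, and rocBLAS with the BEST
# search space trades a longer first-run warmup for faster steady-state matmuls.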
# -----------------------------
# ROCm backend fine-tuning
# -----------------------------
export HSA_ENABLE_ASYNC_COPY=1
export HSA_ENABLE_SDMA=1
export HSA_ENABLE_SDMA_KERNEL_COPY=1
export HSA_ENABLE_SDMA_COPY=1
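# SDMA engines handle DMA copies in hardware, so host<->device transfers can
# overlap with compute instead of occupying the shader cores.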
# -----------------------------
# MIOpen (AMD DNN library)
# -----------------------------
export MIOPEN_FIND_MODE=2
export MIOPEN_ENABLE_CACHE=1
export MIOPEN_CONV_WINOGRAD=1
export MIOPEN_DEBUG_CONV_FFT=0
export MIOPEN_ENABLE_LOGGING_CMD=0
export MIOPEN_DEBUG_CONV_IMPLICIT_GEMM=1
export MIOPEN_USER_DB_PATH="$HOME/.config/miopen"
export MIOPEN_CUSTOM_CACHE_DIR="$HOME/.config/miopen"
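# Make sure the MIOpen user DB / cache directory exists before first use.
mkdir -p "$HOME/.config/miopen"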
# -----------------------------
# Torch / Inductor / Triton settings
# -----------------------------
export TORCH_COMPILE=1
export TORCHINDUCTOR_FORCE_FALLBACK=1
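# Optional smoke test for torch.compile under these Inductor settings
# (run manually; device "cuda" maps to the ROCm GPU in HIP builds of PyTorch):
# python3 -c 'import torch; f = torch.compile(lambda x: x * 2); print(f(torch.ones(4, device="cuda")))'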
# FlashAttention backends
export TRITON_USE_ROCM=1
export TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1
export FLASH_ATTENTION_BACKEND="flash_attn_native"
export FLASH_ATTENTION_TRITON_AMD_ENABLE="true"
export TRANSFORMERS_USE_FLASH_ATTENTION=1
export FLASH_ATTENTION_TRITON_AMD_SEQ_LEN=4096
export USE_CK=OFF
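# Optional: confirm PyTorch's SDPA flash backend is enabled under this config.
# python3 -c 'import torch; print(torch.backends.cuda.flash_sdp_enabled())'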
# rocBLAS tuning for gfx1201 (RDNA4)
export ROCM_PATH="${ROCM_PATH:-/opt/rocm}"   # default install prefix; adjust if ROCm lives elsewhere
export ROCBLAS_TENSILE_LIBPATH="$ROCM_PATH/lib/rocblas"
export ROCBLAS_INTERNAL_FP16_ALT_IMPL=1
export ROCBLAS_LAYER=0
export ROCBLAS_INTERNAL_USE_SUBTENSILE=1
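# ROCBLAS_LAYER is a logging bitmask (0 = off); nonzero values enable rocBLAS
# trace/bench logging, useful when validating the Tensile library path above.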
# -----------------------------
# Run ComfyUI
# -----------------------------
python3 main.py \
    --listen 0.0.0.0 \
    --use-pytorch-cross-attention \
    --normalvram \
    --reserve-vram 1 \
    --fast fp16_accumulation fp8_matrix_mult \
    --disable-smart-memory
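# Quick GPU sanity check if startup misbehaves (run manually inside the venv):
# python3 -c 'import torch; print(torch.version.hip, torch.cuda.is_available(), torch.cuda.get_device_name(0))'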