# Untitled

from calendar import prmonth
from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, UNet2DConditionModel
import torch

# 1. Autoencoder (VAE), loaded with float16 weights; used later to decode latents into image space.
vae = AutoencoderKL.from_pretrained(
    "CompVis/stable-diffusion-v1-4-fp16",
    torch_dtype=torch.float16,
    subfolder="vae",
    use_auth_token=False,
)

# 2. Tokenizer and text encoder that turn the prompts into embeddings (both pinned to one revision).
tokenizer = CLIPTokenizer.from_pretrained(
    "openai/clip-vit-large-patch14",
    revision="0993c71e8ad62658387de2714a69f723ddfffacb",
)
text_encoder = CLIPTextModel.from_pretrained(
    "openai/clip-vit-large-patch14",
    revision="0993c71e8ad62658387de2714a69f723ddfffacb",
)

# 3. UNet that predicts the noise residual for the latents, also loaded with float16 weights.
unet = UNet2DConditionModel.from_pretrained(
    "CompVis/stable-diffusion-v1-4-fp16",
    torch_dtype=torch.float16,
    subfolder="unet",
    use_auth_token=False,
)

from diffusers import LMSDiscreteScheduler

# Two identically configured schedulers: one per prompt, so each keeps its own state.
scheduler = LMSDiscreteScheduler(
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear",
    num_train_timesteps=1000,
)
scheduler_b = LMSDiscreteScheduler(
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear",
    num_train_timesteps=1000,
)

# Move models to GPU
torch_device = "cuda"
for _model in (vae, text_encoder, unet):
    _model.to(torch_device)

# Prompts: one list per generated image series; the batch size follows the first list.
prompt = ["lemon cake."]
prompt_b = ["chocolate cake."]
batch_size = len(prompt)

# Output resolution in pixels (512x512 is the Stable Diffusion default).
height, width = 512, 512

# Sampling hyper-parameters.
num_inference_steps = 50    # number of denoising steps
guidance_scale = 7.5        # scale for classifier-free guidance

# Seeded generator used to create the initial latent noise.
generator = torch.manual_seed(3)

# Text embeddings
# Both prompts are padded/truncated to the tokenizer's maximum length so their embeddings share a shape.
text_input = tokenizer(prompt, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
text_b_input = tokenizer(prompt_b, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")

# Unconditional input: one empty prompt per batch entry, padded to the same token length.
max_length = text_input.input_ids.shape[-1]
uncond_input = tokenizer(
    [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt"
)

# The embeddings are only consumed as inference inputs, so skip autograd bookkeeping
# while running the text encoder (avoids keeping its activations alive).
with torch.no_grad():
    text_embeddings = text_encoder(text_input.input_ids.to(torch_device))[0]
    text_b_embeddings = text_encoder(text_b_input.input_ids.to(torch_device))[0]
    uncond_embeddings = text_encoder(uncond_input.input_ids.to(torch_device))[0]

# Concatenated embeddings: unconditional first, conditional second. `step` splits the
# UNet output with chunk(2) in this same order.
text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
text_b_embeddings = torch.cat([uncond_embeddings, text_b_embeddings])

# Initial random noise, drawn with the seeded generator and then moved to the target device.
latent_shape = (batch_size, unet.in_channels, height // 8, width // 8)
latents = torch.randn(latent_shape, generator=generator).to(torch_device)

# Scheduler setup
scheduler.set_timesteps(num_inference_steps)

# The K-LMS scheduler needs to multiply the latents by its sigma values.
initial_sigma = scheduler.sigmas[0]
latents = latents * initial_sigma

# The second prompt starts from exactly the same noise as the first.
latents_b = latents

from diffusers.models.attention import CrossAttention, AttentionBlock, BasicTransformerBlock
import types

# Module-level state used to hand attention maps from one UNet forward pass to another.
# When this is a list, every freshly computed attention slice is appended to it.
glob_save_att_map = None
# When this is a list, its entries are replayed instead of computing attention.
glob_use_att_map = None
# Position of the next entry to replay from `glob_use_att_map`.
glob_att_layer_index = 0


def crossattn_attention(self, query, key, value, sequence_length, dim):
    """Compute sliced attention, optionally recording or replaying the attention maps.

    Intended to be bound onto an attention module as its ``_attention`` method.
    Only calls with ``dim`` equal to 1280 or 640 take part in map sharing
    (NOTE(review): presumably the widths of the lower-resolution UNet stages --
    confirm); every other call always computes its own attention.

    For participating calls:
      * if ``glob_use_att_map`` is set, the slice's softmaxed attention map is
        read from ``glob_use_att_map[glob_att_layer_index]``;
      * otherwise the map is computed and, when ``glob_save_att_map`` is a
        list, appended to it;
      * ``glob_att_layer_index`` advances by one per slice either way.

    Args:
        self: Attention module providing ``heads``, ``scale``, ``_slice_size``
            and ``reshape_batch_dim_to_heads``.
        query: Query tensor; its first dimension is the attention batch.
        key: Key tensor, sliced along the same first dimension as ``query``.
        value: Value tensor, sliced along the same first dimension as ``query``.
        sequence_length: Size of the second dimension of the output buffer.
        dim: Total feature width; the output buffer's last dimension is
            ``dim // self.heads``.

    Returns:
        The attention output after ``self.reshape_batch_dim_to_heads``.
    """
    global glob_save_att_map, glob_use_att_map, glob_att_layer_index

    batch_size_attention = query.shape[0]
    hidden_states = torch.zeros(
        (batch_size_attention, sequence_length, dim // self.heads), device=query.device, dtype=query.dtype
    )
    slice_size = self._slice_size if self._slice_size is not None else batch_size_attention
    shares_attention_map = dim == 1280 or dim == 640

    # Step through the batch by start offset so that a trailing partial slice is
    # processed too; `batch // slice_size` iterations would leave those rows zero.
    for start_idx in range(0, batch_size_attention, slice_size):
        end_idx = start_idx + slice_size

        if shares_attention_map and glob_use_att_map is not None:
            # Replay the map recorded for this position by an earlier forward pass.
            attn_slice = glob_use_att_map[glob_att_layer_index]
        else:
            attn_slice = torch.matmul(query[start_idx:end_idx], key[start_idx:end_idx].transpose(1, 2)) * self.scale
            attn_slice = attn_slice.softmax(dim=-1)
            if shares_attention_map and glob_save_att_map is not None:
                glob_save_att_map.append(attn_slice)

        if shares_attention_map:
            # Saved and replayed maps are matched purely by this running position.
            glob_att_layer_index = glob_att_layer_index + 1

        hidden_states[start_idx:end_idx] = torch.matmul(attn_slice, value[start_idx:end_idx])

    # reshape hidden_states
    return self.reshape_batch_dim_to_heads(hidden_states)

def replace_attention(model):
    """Bind ``crossattn_attention`` as ``_attention`` on every transformer block's attention modules.

    Walks ``model.modules()`` and, for each ``BasicTransformerBlock``, patches
    ``attn2`` first and then ``attn1`` in place.

    Args:
        model: Object exposing ``modules()``; it is modified in place.
    """
    transformer_blocks = (
        candidate for candidate in model.modules() if isinstance(candidate, BasicTransformerBlock)
    )
    for block in transformer_blocks:
        for attention in (block.attn2, block.attn1):
            # MethodType binds the function so that `self` is the attention module itself.
            attention._attention = types.MethodType(crossattn_attention, attention)

# Route the UNet's transformer-block attention through the map-sharing implementation.
replace_attention(unet)

# Give both schedulers the same inference schedule.
for _scheduler in (scheduler, scheduler_b):
    _scheduler.set_timesteps(num_inference_steps)

from PIL import Image

def step(i, t, latents, text_embeddings, scheduler, use_attmap=None, save_attmap=False):
    """Run one classifier-free-guidance denoising step.

    Args:
        i: Index of the current step; selects ``scheduler.sigmas[i]`` and is
            passed on to ``scheduler.step``.
        t: Timestep value handed to the UNet.
        latents: Current noisy latents.
        text_embeddings: Unconditional and conditional embeddings concatenated
            along the first axis, in that order.
        scheduler: Scheduler providing ``sigmas`` and ``step``.
        use_attmap: Optional list of attention maps to replay inside the
            patched attention instead of computing them.
        save_attmap: When true, collect the attention maps computed during
            this forward pass.

    Returns:
        A tuple ``(latents, attmap)``: the latents after the scheduler update,
        and the list of collected attention maps (``None`` unless
        ``save_attmap`` is true).
    """
    global glob_save_att_map, glob_use_att_map, glob_att_layer_index

    # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
    latent_model_input = torch.cat([latents] * 2)
    sigma = scheduler.sigmas[i]
    latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)

    # Configure the shared state read by the patched attention for this forward pass.
    glob_save_att_map = [] if save_attmap else None
    glob_use_att_map = use_attmap
    glob_att_layer_index = 0

    try:
        # predict the noise residual
        with torch.no_grad():
            noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
        result_attmap = glob_save_att_map
    finally:
        # Always clear the shared state, even if the forward pass raised, so no other
        # UNet call records into or replays a stale list and the maps can be freed.
        glob_save_att_map = None
        glob_use_att_map = None

    # perform guidance
    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

    # compute the previous noisy sample x_t -> x_t-1
    latents = scheduler.step(noise_pred, i, latents).prev_sample

    return latents, result_attmap

def decodeImage(latents):
    """Decode latents with the VAE and convert the result to PIL images.

    Args:
        latents: Latents to decode; they are multiplied by ``1 / 0.18215``
            first (NOTE(review): presumably the Stable Diffusion latent
            scaling factor -- confirm).

    Returns:
        A list with one ``PIL.Image`` per entry along the first axis of the
        decoded output.
    """
    latents = 1 / 0.18215 * latents

    # Decoding is inference-only: do not record an autograd graph through the VAE.
    with torch.no_grad():
        image = vae.decode(latents).sample

    # Map values from [-1, 1] to [0, 1], clamping anything outside that range.
    image = (image / 2 + 0.5).clamp(0, 1)
    # Move axis 1 to the end before handing the array to PIL.
    image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
    images = (image * 255).round().astype("uint8")
    return [Image.fromarray(array) for array in images]

def saveImage(latents, suffix, output_dir="C:\\Projects\\SD\\attmap"):
    """Decode ``latents`` and save the first resulting image as ``<suffix>.png``.

    Args:
        latents: Latents passed to ``decodeImage``.
        suffix: File name (without extension) of the PNG to write.
        output_dir: Directory to write into; created if it does not exist.
            Defaults to the location this script has always used.
    """
    import os

    pil_images = decodeImage(latents)
    # Create the target directory on demand so saving does not fail on a fresh machine.
    os.makedirs(output_dir, exist_ok=True)
    pil_images[0].save(os.path.join(output_dir, f"{suffix}.png"))

from tqdm.auto import tqdm

with torch.autocast("cuda"):
    # Wrap the timesteps themselves (not the enumerate object) so tqdm can read their
    # length and display a total and an ETA.
    for i, t in enumerate(tqdm(scheduler.timesteps)):
        # Prompt B reuses prompt A's attention maps during the first 75% of the steps.
        share_attention = i < (num_inference_steps * 0.75)

        # Only collect prompt A's attention maps on steps where prompt B will replay them.
        latents, attmap = step(i, t, latents, text_embeddings, scheduler, use_attmap=None, save_attmap=share_attention)

        # `attmap` is None once sharing has stopped, so prompt B then computes its own attention.
        latents_b, _ = step(i, t, latents_b, text_b_embeddings, scheduler_b, use_attmap=attmap)

    saveImage(latents, "a")
    # Release the first result before decoding the second one.
    del latents
    saveImage(latents_b, "b")