exported llama on make-cache-traceable
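This paste is the printed torch.export ExportedProgram for a 2-layer Llama wrapped with a static KV cache (the "make-cache-traceable" work). Below is a minimal sketch of how an export like this can be produced; the wrapper name, the generation_config fields, and the tiny config values (inferred from the tensor shapes in the trace: hidden_size 16, 2 layers, 4 heads, head_dim 4, vocab 32000, max_cache_len 1234) are assumptions, not the exact script behind this paste.

import torch
from transformers import LlamaConfig, LlamaForCausalLM
# Wrapper referenced by the trace's executorch.py frames; exact name/signature
# may differ on the make-cache-traceable branch.
from transformers.integrations.executorch import TorchExportableModuleWithStaticCache

config = LlamaConfig(
    vocab_size=32000,
    hidden_size=16,
    intermediate_size=64,
    num_hidden_layers=2,
    num_attention_heads=4,
    num_key_value_heads=4,
)
model = LlamaForCausalLM(config).to(torch.bfloat16).eval()
# Static-cache settings read by the wrapper (assumed field names).
model.generation_config.use_cache = True
model.generation_config.cache_implementation = "static"
model.generation_config.cache_config = {"batch_size": 1, "max_cache_len": 1234}

wrapper = TorchExportableModuleWithStaticCache(model)
input_ids = torch.tensor([[1]], dtype=torch.long)      # i64[1, 1], as in the trace
cache_position = torch.tensor([0], dtype=torch.long)   # i64[1], as in the trace

exported = torch.export.export(wrapper, (input_ids, cache_position))
print(exported)  # prints an ExportedProgram listing like the one below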

  1. ExportedProgram:
  2. class GraphModule(torch.nn.Module):
  3. def forward(self, p_model_model_layers_slice_none__2__none____modules__0___input_layernorm_weight: "bf16[16]", p_model_model_layers_slice_none__2__none____modules__0___post_attention_layernorm_weight: "bf16[16]", p_model_model_layers_slice_none__2__none____modules__1___input_layernorm_weight: "bf16[16]", p_model_model_layers_slice_none__2__none____modules__1___post_attention_layernorm_weight: "bf16[16]", p_model_model_norm_weight: "bf16[16]", p_model_model_embed_tokens_weight: "bf16[32000, 16]", p_model_model_layers_slice_none__2__none____modules__0___self_attn_q_proj_weight: "bf16[16, 16]", p_model_model_layers_slice_none__2__none____modules__0___self_attn_k_proj_weight: "bf16[16, 16]", p_model_model_layers_slice_none__2__none____modules__0___self_attn_v_proj_weight: "bf16[16, 16]", p_model_model_layers_slice_none__2__none____modules__0___self_attn_o_proj_weight: "bf16[16, 16]", p_model_model_layers_slice_none__2__none____modules__0___mlp_gate_proj_weight: "bf16[64, 16]", p_model_model_layers_slice_none__2__none____modules__0___mlp_up_proj_weight: "bf16[64, 16]", p_model_model_layers_slice_none__2__none____modules__0___mlp_down_proj_weight: "bf16[16, 64]", p_model_model_layers_slice_none__2__none____modules__1___self_attn_q_proj_weight: "bf16[16, 16]", p_model_model_layers_slice_none__2__none____modules__1___self_attn_k_proj_weight: "bf16[16, 16]", p_model_model_layers_slice_none__2__none____modules__1___self_attn_v_proj_weight: "bf16[16, 16]", p_model_model_layers_slice_none__2__none____modules__1___self_attn_o_proj_weight: "bf16[16, 16]", p_model_model_layers_slice_none__2__none____modules__1___mlp_gate_proj_weight: "bf16[64, 16]", p_model_model_layers_slice_none__2__none____modules__1___mlp_up_proj_weight: "bf16[64, 16]", p_model_model_layers_slice_none__2__none____modules__1___mlp_down_proj_weight: "bf16[16, 64]", p_model_lm_head_weight: "bf16[32000, 16]", b_mask: "b8[1234, 1234]", b___static_cache_key_cache_0: "bf16[1, 4, 1234, 4]", b_model_model_rotary_emb_inv_freq: "f32[2]", b___static_cache_value_cache_0: "bf16[1, 4, 1234, 4]", b___static_cache_key_cache_1: "bf16[1, 4, 1234, 4]", b___static_cache_value_cache_1: "bf16[1, 4, 1234, 4]", input_ids: "i64[1, 1]", cache_position: "i64[1]"):
  4. # File: /home/ilyas/transformers/src/transformers/integrations/executorch.py:112 in forward, code: attn_mask = self.mask[cache_position, :seqlen] if self.is_causal else None
  5. slice_1: "b8[1234, 1]" = torch.ops.aten.slice.Tensor(b_mask, 1, 0, 1); b_mask = None
  6. index: "b8[1, 1]" = torch.ops.aten.index.Tensor(slice_1, [cache_position]); slice_1 = None
  7.  
  8. # File: /home/ilyas/transformers/src/transformers/integrations/executorch.py:113 in forward, code: position_ids = cache_position.unsqueeze(0)
  9. unsqueeze: "i64[1, 1]" = torch.ops.aten.unsqueeze.default(cache_position, 0)
  10.  
  11. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:550 in forward, code: inputs_embeds = self.embed_tokens(input_ids)
  12. embedding: "bf16[1, 1, 16]" = torch.ops.aten.embedding.default(p_model_model_embed_tokens_weight, input_ids, 31999); p_model_model_embed_tokens_weight = input_ids = None
  13.  
  14. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:728 in _prepare_4d_causal_attention_mask_with_cache_position, code: causal_mask = torch.full(
  15. full: "bf16[1, 1234]" = torch.ops.aten.full.default([1, 1234], -3.3895313892515355e+38, dtype = torch.bfloat16, device = device(type='cpu'), pin_memory = False)
  16.  
  17. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:733 in _prepare_4d_causal_attention_mask_with_cache_position, code: causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
  18. arange: "i64[1234]" = torch.ops.aten.arange.default(1234, device = device(type='cpu'), pin_memory = False)
  19. view: "i64[1, 1]" = torch.ops.aten.view.default(cache_position, [-1, 1])
  20. gt: "b8[1, 1234]" = torch.ops.aten.gt.Tensor(arange, view); arange = view = None
  21. mul: "bf16[1, 1234]" = torch.ops.aten.mul.Tensor(full, gt); full = gt = None
  22.  
  23. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:736 in _prepare_4d_causal_attention_mask_with_cache_position, code: causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
  24. unsqueeze_3: "bf16[1, 1, 1234]" = torch.ops.aten.unsqueeze.default(mul, 0); mul = None
  25. unsqueeze_4: "bf16[1, 1, 1, 1234]" = torch.ops.aten.unsqueeze.default(unsqueeze_3, 1); unsqueeze_3 = None
  26. slice_4: "bf16[1, 1, 1, 1234]" = torch.ops.aten.slice.Tensor(unsqueeze_4, 2, 0, 9223372036854775807); unsqueeze_4 = None
  27. slice_5: "bf16[1, 1, 1, 1234]" = torch.ops.aten.slice.Tensor(slice_4, 3, 0, 9223372036854775807); slice_4 = None
  28. expand_1: "bf16[1, 1, 1, 1234]" = torch.ops.aten.expand.default(slice_5, [1, 1, -1, -1]); slice_5 = None
  29. clone: "bf16[1, 1, 1, 1234]" = torch.ops.aten.clone.default(expand_1); expand_1 = None
  30.  
  31. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:738 in _prepare_4d_causal_attention_mask_with_cache_position, code: padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
  32. slice_6: "bf16[1, 1, 1, 1234]" = torch.ops.aten.slice.Tensor(clone, 0, 0, 9223372036854775807)
  33. slice_7: "bf16[1, 1, 1, 1234]" = torch.ops.aten.slice.Tensor(slice_6, 1, 0, 9223372036854775807); slice_6 = None
  34. slice_8: "bf16[1, 1, 1, 1234]" = torch.ops.aten.slice.Tensor(slice_7, 2, 0, 9223372036854775807); slice_7 = None
  35. slice_9: "bf16[1, 1, 1, 1]" = torch.ops.aten.slice.Tensor(slice_8, 3, 0, 1); slice_8 = None
  36. slice_10: "b8[1, 1]" = torch.ops.aten.slice.Tensor(index, 0, 0, 9223372036854775807); index = None
  37. unsqueeze_5: "b8[1, 1, 1]" = torch.ops.aten.unsqueeze.default(slice_10, 1); slice_10 = None
  38. unsqueeze_6: "b8[1, 1, 1, 1]" = torch.ops.aten.unsqueeze.default(unsqueeze_5, 2); unsqueeze_5 = None
  39. slice_11: "b8[1, 1, 1, 1]" = torch.ops.aten.slice.Tensor(unsqueeze_6, 3, 0, 9223372036854775807); unsqueeze_6 = None
  40. add: "bf16[1, 1, 1, 1]" = torch.ops.aten.add.Tensor(slice_9, slice_11); slice_9 = slice_11 = None
  41.  
  42. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:739 in _prepare_4d_causal_attention_mask_with_cache_position, code: padding_mask = padding_mask == 0
  43. eq: "b8[1, 1, 1, 1]" = torch.ops.aten.eq.Scalar(add, 0); add = None
  44.  
  45. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:740 in _prepare_4d_causal_attention_mask_with_cache_position, code: causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
  46. slice_12: "bf16[1, 1, 1, 1234]" = torch.ops.aten.slice.Tensor(clone, 0, 0, 9223372036854775807)
  47. slice_13: "bf16[1, 1, 1, 1234]" = torch.ops.aten.slice.Tensor(slice_12, 1, 0, 9223372036854775807); slice_12 = None
  48. slice_14: "bf16[1, 1, 1, 1234]" = torch.ops.aten.slice.Tensor(slice_13, 2, 0, 9223372036854775807); slice_13 = None
  49. slice_15: "bf16[1, 1, 1, 1]" = torch.ops.aten.slice.Tensor(slice_14, 3, 0, 1); slice_14 = None
  50. masked_fill: "bf16[1, 1, 1, 1]" = torch.ops.aten.masked_fill.Scalar(slice_15, eq, -3.3895313892515355e+38); slice_15 = eq = None
  51. slice_16: "bf16[1, 1, 1, 1234]" = torch.ops.aten.slice.Tensor(clone, 0, 0, 9223372036854775807)
  52. slice_17: "bf16[1, 1, 1, 1234]" = torch.ops.aten.slice.Tensor(slice_16, 1, 0, 9223372036854775807); slice_16 = None
  53. slice_18: "bf16[1, 1, 1, 1234]" = torch.ops.aten.slice.Tensor(slice_17, 2, 0, 9223372036854775807); slice_17 = None
  54. slice_19: "bf16[1, 1, 1, 1]" = torch.ops.aten.slice.Tensor(slice_18, 3, 0, 1); slice_18 = None
  55. copy: "bf16[1, 1, 1, 1]" = torch.ops.aten.copy.default(slice_19, masked_fill); slice_19 = masked_fill = None
  56. slice_20: "bf16[1, 1, 1, 1234]" = torch.ops.aten.slice.Tensor(clone, 0, 0, 9223372036854775807)
  57. slice_21: "bf16[1, 1, 1, 1234]" = torch.ops.aten.slice.Tensor(slice_20, 1, 0, 9223372036854775807)
  58. slice_22: "bf16[1, 1, 1, 1234]" = torch.ops.aten.slice.Tensor(slice_21, 2, 0, 9223372036854775807)
  59. slice_scatter: "bf16[1, 1, 1, 1234]" = torch.ops.aten.slice_scatter.default(slice_22, copy, 3, 0, 1); slice_22 = copy = None
  60. slice_scatter_1: "bf16[1, 1, 1, 1234]" = torch.ops.aten.slice_scatter.default(slice_21, slice_scatter, 2, 0, 9223372036854775807); slice_21 = slice_scatter = None
  61. slice_scatter_2: "bf16[1, 1, 1, 1234]" = torch.ops.aten.slice_scatter.default(slice_20, slice_scatter_1, 1, 0, 9223372036854775807); slice_20 = slice_scatter_1 = None
  62. slice_scatter_3: "bf16[1, 1, 1, 1234]" = torch.ops.aten.slice_scatter.default(clone, slice_scatter_2, 0, 0, 9223372036854775807); clone = slice_scatter_2 = None
  63.  
  64. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:126 in forward, code: inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
  65. unsqueeze_7: "f32[1, 2]" = torch.ops.aten.unsqueeze.default(b_model_model_rotary_emb_inv_freq, 0); b_model_model_rotary_emb_inv_freq = None
  66. slice_27: "f32[1, 2]" = torch.ops.aten.slice.Tensor(unsqueeze_7, 1, 0, 9223372036854775807); unsqueeze_7 = None
  67. unsqueeze_8: "f32[1, 2, 1]" = torch.ops.aten.unsqueeze.default(slice_27, 2); slice_27 = None
  68. _to_copy: "f32[1, 2, 1]" = torch.ops.aten._to_copy.default(unsqueeze_8, dtype = torch.float32); unsqueeze_8 = None
  69. expand_2: "f32[1, 2, 1]" = torch.ops.aten.expand.default(_to_copy, [1, -1, 1]); _to_copy = None
  70.  
  71. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:127 in forward, code: position_ids_expanded = position_ids[:, None, :].float()
  72. slice_28: "i64[1, 1]" = torch.ops.aten.slice.Tensor(unsqueeze, 0, 0, 9223372036854775807); unsqueeze = None
  73. unsqueeze_9: "i64[1, 1, 1]" = torch.ops.aten.unsqueeze.default(slice_28, 1); slice_28 = None
  74. slice_29: "i64[1, 1, 1]" = torch.ops.aten.slice.Tensor(unsqueeze_9, 2, 0, 9223372036854775807); unsqueeze_9 = None
  75. _to_copy_1: "f32[1, 1, 1]" = torch.ops.aten._to_copy.default(slice_29, dtype = torch.float32); slice_29 = None
  76.  
  77. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:132 in forward, code: freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
  78. _to_copy_2: "f32[1, 2, 1]" = torch.ops.aten._to_copy.default(expand_2, dtype = torch.float32); expand_2 = None
  79. _to_copy_3: "f32[1, 1, 1]" = torch.ops.aten._to_copy.default(_to_copy_1, dtype = torch.float32); _to_copy_1 = None
  80. expand_3: "f32[1, 2, 1]" = torch.ops.aten.expand.default(_to_copy_2, [1, 2, 1]); _to_copy_2 = None
  81. view_1: "f32[1, 2, 1]" = torch.ops.aten.view.default(expand_3, [1, 2, 1]); expand_3 = None
  82. expand_4: "f32[1, 1, 1]" = torch.ops.aten.expand.default(_to_copy_3, [1, 1, 1]); _to_copy_3 = None
  83. view_2: "f32[1, 1, 1]" = torch.ops.aten.view.default(expand_4, [1, 1, 1]); expand_4 = None
  84. bmm: "f32[1, 2, 1]" = torch.ops.aten.bmm.default(view_1, view_2); view_1 = view_2 = None
  85. view_3: "f32[1, 2, 1]" = torch.ops.aten.view.default(bmm, [1, 2, 1]); bmm = None
  86. transpose: "f32[1, 1, 2]" = torch.ops.aten.transpose.int(view_3, 1, 2); view_3 = None
  87.  
  88. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:133 in forward, code: emb = torch.cat((freqs, freqs), dim=-1)
  89. cat: "f32[1, 1, 4]" = torch.ops.aten.cat.default([transpose, transpose], -1); transpose = None
  90.  
  91. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:134 in forward, code: cos = emb.cos()
  92. cos: "f32[1, 1, 4]" = torch.ops.aten.cos.default(cat)
  93.  
  94. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:135 in forward, code: sin = emb.sin()
  95. sin: "f32[1, 1, 4]" = torch.ops.aten.sin.default(cat); cat = None
  96.  
  97. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:138 in forward, code: cos = cos * self.attention_scaling
  98. mul_1: "f32[1, 1, 4]" = torch.ops.aten.mul.Tensor(cos, 1.0); cos = None
  99.  
  100. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:139 in forward, code: sin = sin * self.attention_scaling
  101. mul_2: "f32[1, 1, 4]" = torch.ops.aten.mul.Tensor(sin, 1.0); sin = None
  102.  
  103. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:141 in forward, code: return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
  104. _to_copy_4: "bf16[1, 1, 4]" = torch.ops.aten._to_copy.default(mul_1, dtype = torch.bfloat16); mul_1 = None
  105. _to_copy_5: "bf16[1, 1, 4]" = torch.ops.aten._to_copy.default(mul_2, dtype = torch.bfloat16); mul_2 = None
  106.  
  107. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:71 in forward, code: hidden_states = hidden_states.to(torch.float32)
  108. _to_copy_6: "f32[1, 1, 16]" = torch.ops.aten._to_copy.default(embedding, dtype = torch.float32)
  109.  
  110. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:72 in forward, code: variance = hidden_states.pow(2).mean(-1, keepdim=True)
  111. pow_1: "f32[1, 1, 16]" = torch.ops.aten.pow.Tensor_Scalar(_to_copy_6, 2)
  112. mean: "f32[1, 1, 1]" = torch.ops.aten.mean.dim(pow_1, [-1], True); pow_1 = None
  113.  
  114. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:73 in forward, code: hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
  115. add_1: "f32[1, 1, 1]" = torch.ops.aten.add.Tensor(mean, 1e-06); mean = None
  116. rsqrt: "f32[1, 1, 1]" = torch.ops.aten.rsqrt.default(add_1); add_1 = None
  117. mul_3: "f32[1, 1, 16]" = torch.ops.aten.mul.Tensor(_to_copy_6, rsqrt); _to_copy_6 = rsqrt = None
  118.  
  119. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:74 in forward, code: return self.weight * hidden_states.to(input_dtype)
  120. _to_copy_7: "bf16[1, 1, 16]" = torch.ops.aten._to_copy.default(mul_3, dtype = torch.bfloat16); mul_3 = None
  121. mul_4: "bf16[1, 1, 16]" = torch.ops.aten.mul.Tensor(p_model_model_layers_slice_none__2__none____modules__0___input_layernorm_weight, _to_copy_7); p_model_model_layers_slice_none__2__none____modules__0___input_layernorm_weight = _to_copy_7 = None
  122.  
  123. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:270 in forward, code: query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
  124. t: "bf16[16, 16]" = torch.ops.aten.t.default(p_model_model_layers_slice_none__2__none____modules__0___self_attn_q_proj_weight); p_model_model_layers_slice_none__2__none____modules__0___self_attn_q_proj_weight = None
  125. view_4: "bf16[1, 16]" = torch.ops.aten.view.default(mul_4, [1, 16])
  126. mm: "bf16[1, 16]" = torch.ops.aten.mm.default(view_4, t); view_4 = t = None
  127. view_5: "bf16[1, 1, 16]" = torch.ops.aten.view.default(mm, [1, 1, 16]); mm = None
  128. view_6: "bf16[1, 1, 4, 4]" = torch.ops.aten.view.default(view_5, [1, 1, -1, 4]); view_5 = None
  129. transpose_1: "bf16[1, 4, 1, 4]" = torch.ops.aten.transpose.int(view_6, 1, 2); view_6 = None
  130.  
  131. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:271 in forward, code: key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
  132. t_1: "bf16[16, 16]" = torch.ops.aten.t.default(p_model_model_layers_slice_none__2__none____modules__0___self_attn_k_proj_weight); p_model_model_layers_slice_none__2__none____modules__0___self_attn_k_proj_weight = None
  133. view_7: "bf16[1, 16]" = torch.ops.aten.view.default(mul_4, [1, 16])
  134. mm_1: "bf16[1, 16]" = torch.ops.aten.mm.default(view_7, t_1); view_7 = t_1 = None
  135. view_8: "bf16[1, 1, 16]" = torch.ops.aten.view.default(mm_1, [1, 1, 16]); mm_1 = None
  136. view_9: "bf16[1, 1, 4, 4]" = torch.ops.aten.view.default(view_8, [1, 1, -1, 4]); view_8 = None
  137. transpose_2: "bf16[1, 4, 1, 4]" = torch.ops.aten.transpose.int(view_9, 1, 2); view_9 = None
  138.  
  139. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:272 in forward, code: value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
  140. t_2: "bf16[16, 16]" = torch.ops.aten.t.default(p_model_model_layers_slice_none__2__none____modules__0___self_attn_v_proj_weight); p_model_model_layers_slice_none__2__none____modules__0___self_attn_v_proj_weight = None
  141. view_10: "bf16[1, 16]" = torch.ops.aten.view.default(mul_4, [1, 16]); mul_4 = None
  142. mm_2: "bf16[1, 16]" = torch.ops.aten.mm.default(view_10, t_2); view_10 = t_2 = None
  143. view_11: "bf16[1, 1, 16]" = torch.ops.aten.view.default(mm_2, [1, 1, 16]); mm_2 = None
  144. view_12: "bf16[1, 1, 4, 4]" = torch.ops.aten.view.default(view_11, [1, 1, -1, 4]); view_11 = None
  145. transpose_3: "bf16[1, 4, 1, 4]" = torch.ops.aten.transpose.int(view_12, 1, 2); view_12 = None
  146.  
  147. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:171 in apply_rotary_pos_emb, code: cos = cos.unsqueeze(unsqueeze_dim)
  148. unsqueeze_10: "bf16[1, 1, 1, 4]" = torch.ops.aten.unsqueeze.default(_to_copy_4, 1)
  149.  
  150. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:172 in apply_rotary_pos_emb, code: sin = sin.unsqueeze(unsqueeze_dim)
  151. unsqueeze_11: "bf16[1, 1, 1, 4]" = torch.ops.aten.unsqueeze.default(_to_copy_5, 1)
  152.  
  153. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:173 in apply_rotary_pos_emb, code: q_embed = (q * cos) + (rotate_half(q) * sin)
  154. mul_5: "bf16[1, 4, 1, 4]" = torch.ops.aten.mul.Tensor(transpose_1, unsqueeze_10)
  155.  
  156. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:146 in rotate_half, code: x1 = x[..., : x.shape[-1] // 2]
  157. slice_30: "bf16[1, 4, 1, 2]" = torch.ops.aten.slice.Tensor(transpose_1, 3, 0, 2)
  158.  
  159. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:147 in rotate_half, code: x2 = x[..., x.shape[-1] // 2 :]
  160. slice_31: "bf16[1, 4, 1, 2]" = torch.ops.aten.slice.Tensor(transpose_1, 3, 2, 9223372036854775807); transpose_1 = None
  161.  
  162. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:148 in rotate_half, code: return torch.cat((-x2, x1), dim=-1)
  163. neg: "bf16[1, 4, 1, 2]" = torch.ops.aten.neg.default(slice_31); slice_31 = None
  164. cat_1: "bf16[1, 4, 1, 4]" = torch.ops.aten.cat.default([neg, slice_30], -1); neg = slice_30 = None
  165.  
  166. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:173 in apply_rotary_pos_emb, code: q_embed = (q * cos) + (rotate_half(q) * sin)
  167. mul_6: "bf16[1, 4, 1, 4]" = torch.ops.aten.mul.Tensor(cat_1, unsqueeze_11); cat_1 = None
  168. add_2: "bf16[1, 4, 1, 4]" = torch.ops.aten.add.Tensor(mul_5, mul_6); mul_5 = mul_6 = None
  169.  
  170. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:174 in apply_rotary_pos_emb, code: k_embed = (k * cos) + (rotate_half(k) * sin)
  171. mul_7: "bf16[1, 4, 1, 4]" = torch.ops.aten.mul.Tensor(transpose_2, unsqueeze_10); unsqueeze_10 = None
  172.  
  173. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:146 in rotate_half, code: x1 = x[..., : x.shape[-1] // 2]
  174. slice_32: "bf16[1, 4, 1, 2]" = torch.ops.aten.slice.Tensor(transpose_2, 3, 0, 2)
  175.  
  176. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:147 in rotate_half, code: x2 = x[..., x.shape[-1] // 2 :]
  177. slice_33: "bf16[1, 4, 1, 2]" = torch.ops.aten.slice.Tensor(transpose_2, 3, 2, 9223372036854775807); transpose_2 = None
  178.  
  179. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:148 in rotate_half, code: return torch.cat((-x2, x1), dim=-1)
  180. neg_1: "bf16[1, 4, 1, 2]" = torch.ops.aten.neg.default(slice_33); slice_33 = None
  181. cat_2: "bf16[1, 4, 1, 4]" = torch.ops.aten.cat.default([neg_1, slice_32], -1); neg_1 = slice_32 = None
  182.  
  183. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:174 in apply_rotary_pos_emb, code: k_embed = (k * cos) + (rotate_half(k) * sin)
  184. mul_8: "bf16[1, 4, 1, 4]" = torch.ops.aten.mul.Tensor(cat_2, unsqueeze_11); cat_2 = unsqueeze_11 = None
  185. add_3: "bf16[1, 4, 1, 4]" = torch.ops.aten.add.Tensor(mul_7, mul_8); mul_7 = mul_8 = None
  186.  
  187. # File: /home/ilyas/transformers/src/transformers/cache_utils.py:1182 in update, code: key_states = key_states.to(k_out.dtype)
  188. _to_copy_8: "bf16[1, 4, 1, 4]" = torch.ops.aten._to_copy.default(add_3, dtype = torch.bfloat16); add_3 = None
  189.  
  190. # File: /home/ilyas/transformers/src/transformers/cache_utils.py:1183 in update, code: value_states = value_states.to(v_out.dtype)
  191. _to_copy_9: "bf16[1, 4, 1, 4]" = torch.ops.aten._to_copy.default(transpose_3, dtype = torch.bfloat16); transpose_3 = None
  192.  
  193. # File: /home/ilyas/transformers/src/transformers/cache_utils.py:1193 in update, code: k_out.index_copy_(2, cache_position, key_states)
  194. index_copy: "bf16[1, 4, 1234, 4]" = torch.ops.aten.index_copy.default(b___static_cache_key_cache_0, 2, cache_position, _to_copy_8); b___static_cache_key_cache_0 = _to_copy_8 = None
  195.  
  196. # File: /home/ilyas/transformers/src/transformers/cache_utils.py:1194 in update, code: v_out.index_copy_(2, cache_position, value_states)
  197. index_copy_1: "bf16[1, 4, 1234, 4]" = torch.ops.aten.index_copy.default(b___static_cache_value_cache_0, 2, cache_position, _to_copy_9); b___static_cache_value_cache_0 = _to_copy_9 = None
  198.  
  199. # File: /home/ilyas/transformers/src/transformers/integrations/sdpa_attention.py:48 in sdpa_attention_forward, code: attn_output = torch.nn.functional.scaled_dot_product_attention(
  200. slice_37: "bf16[1, 1, 1, 1234]" = torch.ops.aten.slice.Tensor(slice_scatter_3, 0, 0, 9223372036854775807)
  201. slice_38: "bf16[1, 1, 1, 1234]" = torch.ops.aten.slice.Tensor(slice_37, 1, 0, 9223372036854775807); slice_37 = None
  202. slice_39: "bf16[1, 1, 1, 1234]" = torch.ops.aten.slice.Tensor(slice_38, 2, 0, 9223372036854775807); slice_38 = None
  203. _scaled_dot_product_flash_attention_for_cpu = torch.ops.aten._scaled_dot_product_flash_attention_for_cpu.default(add_2, index_copy, index_copy_1, attn_mask = slice_39, scale = 0.5); add_2 = slice_39 = None
  204. getitem: "bf16[1, 4, 1, 4]" = _scaled_dot_product_flash_attention_for_cpu[0]; _scaled_dot_product_flash_attention_for_cpu = None
  205.  
  206. # File: /home/ilyas/transformers/src/transformers/integrations/sdpa_attention.py:57 in sdpa_attention_forward, code: attn_output = attn_output.transpose(1, 2).contiguous()
  207. transpose_4: "bf16[1, 1, 4, 4]" = torch.ops.aten.transpose.int(getitem, 1, 2); getitem = None
  208.  
  209. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:303 in forward, code: attn_output = attn_output.reshape(*input_shape, -1).contiguous()
  210. view_13: "bf16[1, 1, 16]" = torch.ops.aten.view.default(transpose_4, [1, 1, -1]); transpose_4 = None
  211.  
  212. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:304 in forward, code: attn_output = self.o_proj(attn_output)
  213. t_3: "bf16[16, 16]" = torch.ops.aten.t.default(p_model_model_layers_slice_none__2__none____modules__0___self_attn_o_proj_weight); p_model_model_layers_slice_none__2__none____modules__0___self_attn_o_proj_weight = None
  214. view_14: "bf16[1, 16]" = torch.ops.aten.view.default(view_13, [1, 16]); view_13 = None
  215. mm_3: "bf16[1, 16]" = torch.ops.aten.mm.default(view_14, t_3); view_14 = t_3 = None
  216. view_15: "bf16[1, 1, 16]" = torch.ops.aten.view.default(mm_3, [1, 1, 16]); mm_3 = None
  217.  
  218. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:347 in forward, code: hidden_states = residual + hidden_states
  219. add_4: "bf16[1, 1, 16]" = torch.ops.aten.add.Tensor(embedding, view_15); embedding = view_15 = None
  220.  
  221. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:71 in forward, code: hidden_states = hidden_states.to(torch.float32)
  222. _to_copy_10: "f32[1, 1, 16]" = torch.ops.aten._to_copy.default(add_4, dtype = torch.float32)
  223.  
  224. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:72 in forward, code: variance = hidden_states.pow(2).mean(-1, keepdim=True)
  225. pow_2: "f32[1, 1, 16]" = torch.ops.aten.pow.Tensor_Scalar(_to_copy_10, 2)
  226. mean_1: "f32[1, 1, 1]" = torch.ops.aten.mean.dim(pow_2, [-1], True); pow_2 = None
  227.  
  228. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:73 in forward, code: hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
  229. add_5: "f32[1, 1, 1]" = torch.ops.aten.add.Tensor(mean_1, 1e-06); mean_1 = None
  230. rsqrt_1: "f32[1, 1, 1]" = torch.ops.aten.rsqrt.default(add_5); add_5 = None
  231. mul_9: "f32[1, 1, 16]" = torch.ops.aten.mul.Tensor(_to_copy_10, rsqrt_1); _to_copy_10 = rsqrt_1 = None
  232.  
  233. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:74 in forward, code: return self.weight * hidden_states.to(input_dtype)
  234. _to_copy_11: "bf16[1, 1, 16]" = torch.ops.aten._to_copy.default(mul_9, dtype = torch.bfloat16); mul_9 = None
  235. mul_10: "bf16[1, 1, 16]" = torch.ops.aten.mul.Tensor(p_model_model_layers_slice_none__2__none____modules__0___post_attention_layernorm_weight, _to_copy_11); p_model_model_layers_slice_none__2__none____modules__0___post_attention_layernorm_weight = _to_copy_11 = None
  236.  
  237. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:190 in forward, code: down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
  238. t_4: "bf16[16, 64]" = torch.ops.aten.t.default(p_model_model_layers_slice_none__2__none____modules__0___mlp_gate_proj_weight); p_model_model_layers_slice_none__2__none____modules__0___mlp_gate_proj_weight = None
  239. view_16: "bf16[1, 16]" = torch.ops.aten.view.default(mul_10, [1, 16])
  240. mm_4: "bf16[1, 64]" = torch.ops.aten.mm.default(view_16, t_4); view_16 = t_4 = None
  241. view_17: "bf16[1, 1, 64]" = torch.ops.aten.view.default(mm_4, [1, 1, 64]); mm_4 = None
  242. silu: "bf16[1, 1, 64]" = torch.ops.aten.silu.default(view_17); view_17 = None
  243. t_5: "bf16[16, 64]" = torch.ops.aten.t.default(p_model_model_layers_slice_none__2__none____modules__0___mlp_up_proj_weight); p_model_model_layers_slice_none__2__none____modules__0___mlp_up_proj_weight = None
  244. view_18: "bf16[1, 16]" = torch.ops.aten.view.default(mul_10, [1, 16]); mul_10 = None
  245. mm_5: "bf16[1, 64]" = torch.ops.aten.mm.default(view_18, t_5); view_18 = t_5 = None
  246. view_19: "bf16[1, 1, 64]" = torch.ops.aten.view.default(mm_5, [1, 1, 64]); mm_5 = None
  247. mul_11: "bf16[1, 1, 64]" = torch.ops.aten.mul.Tensor(silu, view_19); silu = view_19 = None
  248. t_6: "bf16[64, 16]" = torch.ops.aten.t.default(p_model_model_layers_slice_none__2__none____modules__0___mlp_down_proj_weight); p_model_model_layers_slice_none__2__none____modules__0___mlp_down_proj_weight = None
  249. view_20: "bf16[1, 64]" = torch.ops.aten.view.default(mul_11, [1, 64]); mul_11 = None
  250. mm_6: "bf16[1, 16]" = torch.ops.aten.mm.default(view_20, t_6); view_20 = t_6 = None
  251. view_21: "bf16[1, 1, 16]" = torch.ops.aten.view.default(mm_6, [1, 1, 16]); mm_6 = None
  252.  
  253. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:353 in forward, code: hidden_states = residual + hidden_states
  254. add_6: "bf16[1, 1, 16]" = torch.ops.aten.add.Tensor(add_4, view_21); add_4 = view_21 = None
  255.  
  256. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:71 in forward, code: hidden_states = hidden_states.to(torch.float32)
  257. _to_copy_12: "f32[1, 1, 16]" = torch.ops.aten._to_copy.default(add_6, dtype = torch.float32)
  258.  
  259. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:72 in forward, code: variance = hidden_states.pow(2).mean(-1, keepdim=True)
  260. pow_3: "f32[1, 1, 16]" = torch.ops.aten.pow.Tensor_Scalar(_to_copy_12, 2)
  261. mean_2: "f32[1, 1, 1]" = torch.ops.aten.mean.dim(pow_3, [-1], True); pow_3 = None
  262.  
  263. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:73 in forward, code: hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
  264. add_7: "f32[1, 1, 1]" = torch.ops.aten.add.Tensor(mean_2, 1e-06); mean_2 = None
  265. rsqrt_2: "f32[1, 1, 1]" = torch.ops.aten.rsqrt.default(add_7); add_7 = None
  266. mul_12: "f32[1, 1, 16]" = torch.ops.aten.mul.Tensor(_to_copy_12, rsqrt_2); _to_copy_12 = rsqrt_2 = None
  267.  
  268. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:74 in forward, code: return self.weight * hidden_states.to(input_dtype)
  269. _to_copy_13: "bf16[1, 1, 16]" = torch.ops.aten._to_copy.default(mul_12, dtype = torch.bfloat16); mul_12 = None
  270. mul_13: "bf16[1, 1, 16]" = torch.ops.aten.mul.Tensor(p_model_model_layers_slice_none__2__none____modules__1___input_layernorm_weight, _to_copy_13); p_model_model_layers_slice_none__2__none____modules__1___input_layernorm_weight = _to_copy_13 = None
  271.  
  272. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:270 in forward, code: query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
  273. t_7: "bf16[16, 16]" = torch.ops.aten.t.default(p_model_model_layers_slice_none__2__none____modules__1___self_attn_q_proj_weight); p_model_model_layers_slice_none__2__none____modules__1___self_attn_q_proj_weight = None
  274. view_22: "bf16[1, 16]" = torch.ops.aten.view.default(mul_13, [1, 16])
  275. mm_7: "bf16[1, 16]" = torch.ops.aten.mm.default(view_22, t_7); view_22 = t_7 = None
  276. view_23: "bf16[1, 1, 16]" = torch.ops.aten.view.default(mm_7, [1, 1, 16]); mm_7 = None
  277. view_24: "bf16[1, 1, 4, 4]" = torch.ops.aten.view.default(view_23, [1, 1, -1, 4]); view_23 = None
  278. transpose_5: "bf16[1, 4, 1, 4]" = torch.ops.aten.transpose.int(view_24, 1, 2); view_24 = None
  279.  
  280. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:271 in forward, code: key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
  281. t_8: "bf16[16, 16]" = torch.ops.aten.t.default(p_model_model_layers_slice_none__2__none____modules__1___self_attn_k_proj_weight); p_model_model_layers_slice_none__2__none____modules__1___self_attn_k_proj_weight = None
  282. view_25: "bf16[1, 16]" = torch.ops.aten.view.default(mul_13, [1, 16])
  283. mm_8: "bf16[1, 16]" = torch.ops.aten.mm.default(view_25, t_8); view_25 = t_8 = None
  284. view_26: "bf16[1, 1, 16]" = torch.ops.aten.view.default(mm_8, [1, 1, 16]); mm_8 = None
  285. view_27: "bf16[1, 1, 4, 4]" = torch.ops.aten.view.default(view_26, [1, 1, -1, 4]); view_26 = None
  286. transpose_6: "bf16[1, 4, 1, 4]" = torch.ops.aten.transpose.int(view_27, 1, 2); view_27 = None
  287.  
  288. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:272 in forward, code: value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
  289. t_9: "bf16[16, 16]" = torch.ops.aten.t.default(p_model_model_layers_slice_none__2__none____modules__1___self_attn_v_proj_weight); p_model_model_layers_slice_none__2__none____modules__1___self_attn_v_proj_weight = None
  290. view_28: "bf16[1, 16]" = torch.ops.aten.view.default(mul_13, [1, 16]); mul_13 = None
  291. mm_9: "bf16[1, 16]" = torch.ops.aten.mm.default(view_28, t_9); view_28 = t_9 = None
  292. view_29: "bf16[1, 1, 16]" = torch.ops.aten.view.default(mm_9, [1, 1, 16]); mm_9 = None
  293. view_30: "bf16[1, 1, 4, 4]" = torch.ops.aten.view.default(view_29, [1, 1, -1, 4]); view_29 = None
  294. transpose_7: "bf16[1, 4, 1, 4]" = torch.ops.aten.transpose.int(view_30, 1, 2); view_30 = None
  295.  
  296. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:171 in apply_rotary_pos_emb, code: cos = cos.unsqueeze(unsqueeze_dim)
  297. unsqueeze_12: "bf16[1, 1, 1, 4]" = torch.ops.aten.unsqueeze.default(_to_copy_4, 1); _to_copy_4 = None
  298.  
  299. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:172 in apply_rotary_pos_emb, code: sin = sin.unsqueeze(unsqueeze_dim)
  300. unsqueeze_13: "bf16[1, 1, 1, 4]" = torch.ops.aten.unsqueeze.default(_to_copy_5, 1); _to_copy_5 = None
  301.  
  302. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:173 in apply_rotary_pos_emb, code: q_embed = (q * cos) + (rotate_half(q) * sin)
  303. mul_14: "bf16[1, 4, 1, 4]" = torch.ops.aten.mul.Tensor(transpose_5, unsqueeze_12)
  304.  
  305. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:146 in rotate_half, code: x1 = x[..., : x.shape[-1] // 2]
  306. slice_40: "bf16[1, 4, 1, 2]" = torch.ops.aten.slice.Tensor(transpose_5, 3, 0, 2)
  307.  
  308. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:147 in rotate_half, code: x2 = x[..., x.shape[-1] // 2 :]
  309. slice_41: "bf16[1, 4, 1, 2]" = torch.ops.aten.slice.Tensor(transpose_5, 3, 2, 9223372036854775807); transpose_5 = None
  310.  
  311. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:148 in rotate_half, code: return torch.cat((-x2, x1), dim=-1)
  312. neg_2: "bf16[1, 4, 1, 2]" = torch.ops.aten.neg.default(slice_41); slice_41 = None
  313. cat_3: "bf16[1, 4, 1, 4]" = torch.ops.aten.cat.default([neg_2, slice_40], -1); neg_2 = slice_40 = None
  314.  
  315. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:173 in apply_rotary_pos_emb, code: q_embed = (q * cos) + (rotate_half(q) * sin)
  316. mul_15: "bf16[1, 4, 1, 4]" = torch.ops.aten.mul.Tensor(cat_3, unsqueeze_13); cat_3 = None
  317. add_8: "bf16[1, 4, 1, 4]" = torch.ops.aten.add.Tensor(mul_14, mul_15); mul_14 = mul_15 = None
  318.  
  319. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:174 in apply_rotary_pos_emb, code: k_embed = (k * cos) + (rotate_half(k) * sin)
  320. mul_16: "bf16[1, 4, 1, 4]" = torch.ops.aten.mul.Tensor(transpose_6, unsqueeze_12); unsqueeze_12 = None
  321.  
  322. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:146 in rotate_half, code: x1 = x[..., : x.shape[-1] // 2]
  323. slice_42: "bf16[1, 4, 1, 2]" = torch.ops.aten.slice.Tensor(transpose_6, 3, 0, 2)
  324.  
  325. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:147 in rotate_half, code: x2 = x[..., x.shape[-1] // 2 :]
  326. slice_43: "bf16[1, 4, 1, 2]" = torch.ops.aten.slice.Tensor(transpose_6, 3, 2, 9223372036854775807); transpose_6 = None
  327.  
  328. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:148 in rotate_half, code: return torch.cat((-x2, x1), dim=-1)
  329. neg_3: "bf16[1, 4, 1, 2]" = torch.ops.aten.neg.default(slice_43); slice_43 = None
  330. cat_4: "bf16[1, 4, 1, 4]" = torch.ops.aten.cat.default([neg_3, slice_42], -1); neg_3 = slice_42 = None
  331.  
  332. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:174 in apply_rotary_pos_emb, code: k_embed = (k * cos) + (rotate_half(k) * sin)
  333. mul_17: "bf16[1, 4, 1, 4]" = torch.ops.aten.mul.Tensor(cat_4, unsqueeze_13); cat_4 = unsqueeze_13 = None
  334. add_9: "bf16[1, 4, 1, 4]" = torch.ops.aten.add.Tensor(mul_16, mul_17); mul_16 = mul_17 = None
  335.  
  336. # File: /home/ilyas/transformers/src/transformers/cache_utils.py:1182 in update, code: key_states = key_states.to(k_out.dtype)
  337. _to_copy_14: "bf16[1, 4, 1, 4]" = torch.ops.aten._to_copy.default(add_9, dtype = torch.bfloat16); add_9 = None
  338.  
  339. # File: /home/ilyas/transformers/src/transformers/cache_utils.py:1183 in update, code: value_states = value_states.to(v_out.dtype)
  340. _to_copy_15: "bf16[1, 4, 1, 4]" = torch.ops.aten._to_copy.default(transpose_7, dtype = torch.bfloat16); transpose_7 = None
  341.  
  342. # File: /home/ilyas/transformers/src/transformers/cache_utils.py:1193 in update, code: k_out.index_copy_(2, cache_position, key_states)
  343. index_copy_2: "bf16[1, 4, 1234, 4]" = torch.ops.aten.index_copy.default(b___static_cache_key_cache_1, 2, cache_position, _to_copy_14); b___static_cache_key_cache_1 = _to_copy_14 = None
  344.  
  345. # File: /home/ilyas/transformers/src/transformers/cache_utils.py:1194 in update, code: v_out.index_copy_(2, cache_position, value_states)
  346. index_copy_3: "bf16[1, 4, 1234, 4]" = torch.ops.aten.index_copy.default(b___static_cache_value_cache_1, 2, cache_position, _to_copy_15); b___static_cache_value_cache_1 = cache_position = _to_copy_15 = None
  347.  
  348. # File: /home/ilyas/transformers/src/transformers/integrations/sdpa_attention.py:48 in sdpa_attention_forward, code: attn_output = torch.nn.functional.scaled_dot_product_attention(
  349. slice_47: "bf16[1, 1, 1, 1234]" = torch.ops.aten.slice.Tensor(slice_scatter_3, 0, 0, 9223372036854775807); slice_scatter_3 = None
  350. slice_48: "bf16[1, 1, 1, 1234]" = torch.ops.aten.slice.Tensor(slice_47, 1, 0, 9223372036854775807); slice_47 = None
  351. slice_49: "bf16[1, 1, 1, 1234]" = torch.ops.aten.slice.Tensor(slice_48, 2, 0, 9223372036854775807); slice_48 = None
  352. _scaled_dot_product_flash_attention_for_cpu_1 = torch.ops.aten._scaled_dot_product_flash_attention_for_cpu.default(add_8, index_copy_2, index_copy_3, attn_mask = slice_49, scale = 0.5); add_8 = slice_49 = None
  353. getitem_2: "bf16[1, 4, 1, 4]" = _scaled_dot_product_flash_attention_for_cpu_1[0]; _scaled_dot_product_flash_attention_for_cpu_1 = None
  354.  
  355. # File: /home/ilyas/transformers/src/transformers/integrations/sdpa_attention.py:57 in sdpa_attention_forward, code: attn_output = attn_output.transpose(1, 2).contiguous()
  356. transpose_8: "bf16[1, 1, 4, 4]" = torch.ops.aten.transpose.int(getitem_2, 1, 2); getitem_2 = None
  357.  
  358. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:303 in forward, code: attn_output = attn_output.reshape(*input_shape, -1).contiguous()
  359. view_31: "bf16[1, 1, 16]" = torch.ops.aten.view.default(transpose_8, [1, 1, -1]); transpose_8 = None
  360.  
  361. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:304 in forward, code: attn_output = self.o_proj(attn_output)
  362. t_10: "bf16[16, 16]" = torch.ops.aten.t.default(p_model_model_layers_slice_none__2__none____modules__1___self_attn_o_proj_weight); p_model_model_layers_slice_none__2__none____modules__1___self_attn_o_proj_weight = None
  363. view_32: "bf16[1, 16]" = torch.ops.aten.view.default(view_31, [1, 16]); view_31 = None
  364. mm_10: "bf16[1, 16]" = torch.ops.aten.mm.default(view_32, t_10); view_32 = t_10 = None
  365. view_33: "bf16[1, 1, 16]" = torch.ops.aten.view.default(mm_10, [1, 1, 16]); mm_10 = None
  366.  
  367. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:347 in forward, code: hidden_states = residual + hidden_states
  368. add_10: "bf16[1, 1, 16]" = torch.ops.aten.add.Tensor(add_6, view_33); add_6 = view_33 = None
  369.  
  370. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:71 in forward, code: hidden_states = hidden_states.to(torch.float32)
  371. _to_copy_16: "f32[1, 1, 16]" = torch.ops.aten._to_copy.default(add_10, dtype = torch.float32)
  372.  
  373. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:72 in forward, code: variance = hidden_states.pow(2).mean(-1, keepdim=True)
  374. pow_4: "f32[1, 1, 16]" = torch.ops.aten.pow.Tensor_Scalar(_to_copy_16, 2)
  375. mean_3: "f32[1, 1, 1]" = torch.ops.aten.mean.dim(pow_4, [-1], True); pow_4 = None
  376.  
  377. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:73 in forward, code: hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
  378. add_11: "f32[1, 1, 1]" = torch.ops.aten.add.Tensor(mean_3, 1e-06); mean_3 = None
  379. rsqrt_3: "f32[1, 1, 1]" = torch.ops.aten.rsqrt.default(add_11); add_11 = None
  380. mul_18: "f32[1, 1, 16]" = torch.ops.aten.mul.Tensor(_to_copy_16, rsqrt_3); _to_copy_16 = rsqrt_3 = None
  381.  
  382. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:74 in forward, code: return self.weight * hidden_states.to(input_dtype)
  383. _to_copy_17: "bf16[1, 1, 16]" = torch.ops.aten._to_copy.default(mul_18, dtype = torch.bfloat16); mul_18 = None
  384. mul_19: "bf16[1, 1, 16]" = torch.ops.aten.mul.Tensor(p_model_model_layers_slice_none__2__none____modules__1___post_attention_layernorm_weight, _to_copy_17); p_model_model_layers_slice_none__2__none____modules__1___post_attention_layernorm_weight = _to_copy_17 = None
  385.  
  386. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:190 in forward, code: down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
  387. t_11: "bf16[16, 64]" = torch.ops.aten.t.default(p_model_model_layers_slice_none__2__none____modules__1___mlp_gate_proj_weight); p_model_model_layers_slice_none__2__none____modules__1___mlp_gate_proj_weight = None
  388. view_34: "bf16[1, 16]" = torch.ops.aten.view.default(mul_19, [1, 16])
  389. mm_11: "bf16[1, 64]" = torch.ops.aten.mm.default(view_34, t_11); view_34 = t_11 = None
  390. view_35: "bf16[1, 1, 64]" = torch.ops.aten.view.default(mm_11, [1, 1, 64]); mm_11 = None
  391. silu_1: "bf16[1, 1, 64]" = torch.ops.aten.silu.default(view_35); view_35 = None
  392. t_12: "bf16[16, 64]" = torch.ops.aten.t.default(p_model_model_layers_slice_none__2__none____modules__1___mlp_up_proj_weight); p_model_model_layers_slice_none__2__none____modules__1___mlp_up_proj_weight = None
  393. view_36: "bf16[1, 16]" = torch.ops.aten.view.default(mul_19, [1, 16]); mul_19 = None
  394. mm_12: "bf16[1, 64]" = torch.ops.aten.mm.default(view_36, t_12); view_36 = t_12 = None
  395. view_37: "bf16[1, 1, 64]" = torch.ops.aten.view.default(mm_12, [1, 1, 64]); mm_12 = None
  396. mul_20: "bf16[1, 1, 64]" = torch.ops.aten.mul.Tensor(silu_1, view_37); silu_1 = view_37 = None
  397. t_13: "bf16[64, 16]" = torch.ops.aten.t.default(p_model_model_layers_slice_none__2__none____modules__1___mlp_down_proj_weight); p_model_model_layers_slice_none__2__none____modules__1___mlp_down_proj_weight = None
  398. view_38: "bf16[1, 64]" = torch.ops.aten.view.default(mul_20, [1, 64]); mul_20 = None
  399. mm_13: "bf16[1, 16]" = torch.ops.aten.mm.default(view_38, t_13); view_38 = t_13 = None
  400. view_39: "bf16[1, 1, 16]" = torch.ops.aten.view.default(mm_13, [1, 1, 16]); mm_13 = None
  401.  
  402. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:353 in forward, code: hidden_states = residual + hidden_states
  403. add_12: "bf16[1, 1, 16]" = torch.ops.aten.add.Tensor(add_10, view_39); add_10 = view_39 = None
  404.  
  405. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:71 in forward, code: hidden_states = hidden_states.to(torch.float32)
  406. _to_copy_18: "f32[1, 1, 16]" = torch.ops.aten._to_copy.default(add_12, dtype = torch.float32); add_12 = None
  407.  
  408. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:72 in forward, code: variance = hidden_states.pow(2).mean(-1, keepdim=True)
  409. pow_5: "f32[1, 1, 16]" = torch.ops.aten.pow.Tensor_Scalar(_to_copy_18, 2)
  410. mean_4: "f32[1, 1, 1]" = torch.ops.aten.mean.dim(pow_5, [-1], True); pow_5 = None
  411.  
  412. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:73 in forward, code: hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
  413. add_13: "f32[1, 1, 1]" = torch.ops.aten.add.Tensor(mean_4, 1e-06); mean_4 = None
  414. rsqrt_4: "f32[1, 1, 1]" = torch.ops.aten.rsqrt.default(add_13); add_13 = None
  415. mul_21: "f32[1, 1, 16]" = torch.ops.aten.mul.Tensor(_to_copy_18, rsqrt_4); _to_copy_18 = rsqrt_4 = None
  416.  
  417. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:74 in forward, code: return self.weight * hidden_states.to(input_dtype)
  418. _to_copy_19: "bf16[1, 1, 16]" = torch.ops.aten._to_copy.default(mul_21, dtype = torch.bfloat16); mul_21 = None
  419. mul_22: "bf16[1, 1, 16]" = torch.ops.aten.mul.Tensor(p_model_model_norm_weight, _to_copy_19); p_model_model_norm_weight = _to_copy_19 = None
  420.  
  421. # File: /home/ilyas/transformers/src/transformers/models/llama/modeling_llama.py:856 in forward, code: logits = self.lm_head(hidden_states[:, slice_indices, :])
  422. slice_50: "bf16[1, 1, 16]" = torch.ops.aten.slice.Tensor(mul_22, 0, 0, 9223372036854775807); mul_22 = None
  423. slice_51: "bf16[1, 1, 16]" = torch.ops.aten.slice.Tensor(slice_50, 1, 0, 9223372036854775807); slice_50 = None
  424. slice_52: "bf16[1, 1, 16]" = torch.ops.aten.slice.Tensor(slice_51, 2, 0, 9223372036854775807); slice_51 = None
  425. t_14: "bf16[16, 32000]" = torch.ops.aten.t.default(p_model_lm_head_weight); p_model_lm_head_weight = None
  426. view_40: "bf16[1, 16]" = torch.ops.aten.view.default(slice_52, [1, 16]); slice_52 = None
  427. mm_14: "bf16[1, 32000]" = torch.ops.aten.mm.default(view_40, t_14); view_40 = t_14 = None
  428. view_41: "bf16[1, 1, 32000]" = torch.ops.aten.view.default(mm_14, [1, 1, 32000]); mm_14 = None
  429. return (index_copy, index_copy_1, index_copy_2, index_copy_3, view_41)
  430.  
  431. Graph signature: ExportGraphSignature(input_specs=[InputSpec(kind=<InputKind.PARAMETER: 2>, arg=TensorArgument(name='p_model_model_layers_slice_none__2__none____modules__0___input_layernorm_weight'), target='model.model.layers.0.input_layernorm.weight', persistent=None), InputSpec(kind=<InputKind.PARAMETER: 2>, arg=TensorArgument(name='p_model_model_layers_slice_none__2__none____modules__0___post_attention_layernorm_weight'), target='model.model.layers.0.post_attention_layernorm.weight', persistent=None), InputSpec(kind=<InputKind.PARAMETER: 2>, arg=TensorArgument(name='p_model_model_layers_slice_none__2__none____modules__1___input_layernorm_weight'), target='model.model.layers.1.input_layernorm.weight', persistent=None), InputSpec(kind=<InputKind.PARAMETER: 2>, arg=TensorArgument(name='p_model_model_layers_slice_none__2__none____modules__1___post_attention_layernorm_weight'), target='model.model.layers.1.post_attention_layernorm.weight', persistent=None), InputSpec(kind=<InputKind.PARAMETER: 2>, arg=TensorArgument(name='p_model_model_norm_weight'), target='model.model.norm.weight', persistent=None), InputSpec(kind=<InputKind.PARAMETER: 2>, arg=TensorArgument(name='p_model_model_embed_tokens_weight'), target='model.model.embed_tokens.weight', persistent=None), InputSpec(kind=<InputKind.PARAMETER: 2>, arg=TensorArgument(name='p_model_model_layers_slice_none__2__none____modules__0___self_attn_q_proj_weight'), target='model.model.layers.0.self_attn.q_proj.weight', persistent=None), InputSpec(kind=<InputKind.PARAMETER: 2>, arg=TensorArgument(name='p_model_model_layers_slice_none__2__none____modules__0___self_attn_k_proj_weight'), target='model.model.layers.0.self_attn.k_proj.weight', persistent=None), InputSpec(kind=<InputKind.PARAMETER: 2>, arg=TensorArgument(name='p_model_model_layers_slice_none__2__none____modules__0___self_attn_v_proj_weight'), target='model.model.layers.0.self_attn.v_proj.weight', persistent=None), InputSpec(kind=<InputKind.PARAMETER: 2>, arg=TensorArgument(name='p_model_model_layers_slice_none__2__none____modules__0___self_attn_o_proj_weight'), target='model.model.layers.0.self_attn.o_proj.weight', persistent=None), InputSpec(kind=<InputKind.PARAMETER: 2>, arg=TensorArgument(name='p_model_model_layers_slice_none__2__none____modules__0___mlp_gate_proj_weight'), target='model.model.layers.0.mlp.gate_proj.weight', persistent=None), InputSpec(kind=<InputKind.PARAMETER: 2>, arg=TensorArgument(name='p_model_model_layers_slice_none__2__none____modules__0___mlp_up_proj_weight'), target='model.model.layers.0.mlp.up_proj.weight', persistent=None), InputSpec(kind=<InputKind.PARAMETER: 2>, arg=TensorArgument(name='p_model_model_layers_slice_none__2__none____modules__0___mlp_down_proj_weight'), target='model.model.layers.0.mlp.down_proj.weight', persistent=None), InputSpec(kind=<InputKind.PARAMETER: 2>, arg=TensorArgument(name='p_model_model_layers_slice_none__2__none____modules__1___self_attn_q_proj_weight'), target='model.model.layers.1.self_attn.q_proj.weight', persistent=None), InputSpec(kind=<InputKind.PARAMETER: 2>, arg=TensorArgument(name='p_model_model_layers_slice_none__2__none____modules__1___self_attn_k_proj_weight'), target='model.model.layers.1.self_attn.k_proj.weight', persistent=None), InputSpec(kind=<InputKind.PARAMETER: 2>, arg=TensorArgument(name='p_model_model_layers_slice_none__2__none____modules__1___self_attn_v_proj_weight'), target='model.model.layers.1.self_attn.v_proj.weight', persistent=None), InputSpec(kind=<InputKind.PARAMETER: 2>, 
arg=TensorArgument(name='p_model_model_layers_slice_none__2__none____modules__1___self_attn_o_proj_weight'), target='model.model.layers.1.self_attn.o_proj.weight', persistent=None), InputSpec(kind=<InputKind.PARAMETER: 2>, arg=TensorArgument(name='p_model_model_layers_slice_none__2__none____modules__1___mlp_gate_proj_weight'), target='model.model.layers.1.mlp.gate_proj.weight', persistent=None), InputSpec(kind=<InputKind.PARAMETER: 2>, arg=TensorArgument(name='p_model_model_layers_slice_none__2__none____modules__1___mlp_up_proj_weight'), target='model.model.layers.1.mlp.up_proj.weight', persistent=None), InputSpec(kind=<InputKind.PARAMETER: 2>, arg=TensorArgument(name='p_model_model_layers_slice_none__2__none____modules__1___mlp_down_proj_weight'), target='model.model.layers.1.mlp.down_proj.weight', persistent=None), InputSpec(kind=<InputKind.PARAMETER: 2>, arg=TensorArgument(name='p_model_lm_head_weight'), target='model.lm_head.weight', persistent=None), InputSpec(kind=<InputKind.BUFFER: 3>, arg=TensorArgument(name='b_mask'), target='mask', persistent=False), InputSpec(kind=<InputKind.BUFFER: 3>, arg=TensorArgument(name='b___static_cache_key_cache_0'), target='key_cache_0', persistent=False), InputSpec(kind=<InputKind.BUFFER: 3>, arg=TensorArgument(name='b_model_model_rotary_emb_inv_freq'), target='model.model.rotary_emb.inv_freq', persistent=False), InputSpec(kind=<InputKind.BUFFER: 3>, arg=TensorArgument(name='b___static_cache_value_cache_0'), target='value_cache_0', persistent=False), InputSpec(kind=<InputKind.BUFFER: 3>, arg=TensorArgument(name='b___static_cache_key_cache_1'), target='key_cache_1', persistent=False), InputSpec(kind=<InputKind.BUFFER: 3>, arg=TensorArgument(name='b___static_cache_value_cache_1'), target='value_cache_1', persistent=False), InputSpec(kind=<InputKind.USER_INPUT: 1>, arg=TensorArgument(name='input_ids'), target=None, persistent=None), InputSpec(kind=<InputKind.USER_INPUT: 1>, arg=TensorArgument(name='cache_position'), target=None, persistent=None)], output_specs=[OutputSpec(kind=<OutputKind.BUFFER_MUTATION: 3>, arg=TensorArgument(name='index_copy'), target='key_cache_0'), OutputSpec(kind=<OutputKind.BUFFER_MUTATION: 3>, arg=TensorArgument(name='index_copy_1'), target='value_cache_0'), OutputSpec(kind=<OutputKind.BUFFER_MUTATION: 3>, arg=TensorArgument(name='index_copy_2'), target='key_cache_1'), OutputSpec(kind=<OutputKind.BUFFER_MUTATION: 3>, arg=TensorArgument(name='index_copy_3'), target='value_cache_1'), OutputSpec(kind=<OutputKind.USER_OUTPUT: 1>, arg=TensorArgument(name='view_41'), target=None)])
  432. Range constraints: {}
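For reference, the full/arange/gt/mul/masked_fill cluster near the top of the graph (modeling_llama.py:728-740) builds the additive causal mask. A minimal standalone sketch of that construction follows, using the q_len=1 and max_cache_len=1234 seen in the trace; the helper name is made up for illustration and the padding-mask branch is omitted.

import torch

def build_causal_mask(cache_position, target_length=1234, dtype=torch.bfloat16):
    # Rows are query positions (here: the single decoded token), columns are cache
    # slots; entries strictly after cache_position get dtype-min so softmax ignores
    # them, everything else stays 0.
    min_val = torch.finfo(dtype).min  # -3.3895e+38 for bfloat16, as in the graph
    mask = torch.full((cache_position.shape[0], target_length), min_val, dtype=dtype)
    mask *= torch.arange(target_length) > cache_position.reshape(-1, 1)
    return mask[None, None, :, :]  # bf16[1, 1, q_len, target_length]

print(build_causal_mask(torch.tensor([0])).shape)  # torch.Size([1, 1, 1, 1234])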
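The repeated slice/neg/cat/mul/add groups (modeling_llama.py:146-174) and the pow/mean/rsqrt groups (modeling_llama.py:71-74) are the decompositions of the eager-mode RoPE and RMSNorm code that the trace's comments point at. Copied here for readability, with LlamaRMSNorm.forward restated as a free function:

import torch

def rotate_half(x):
    # Rotates half the hidden dims of the input.
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed

def rms_norm(hidden_states, weight, variance_epsilon=1e-6):
    # variance_epsilon is 1e-6 in this config, per the add(..., 1e-06) nodes.
    input_dtype = hidden_states.dtype
    hidden_states = hidden_states.to(torch.float32)
    variance = hidden_states.pow(2).mean(-1, keepdim=True)
    hidden_states = hidden_states * torch.rsqrt(variance + variance_epsilon)
    return weight * hidden_states.to(input_dtype)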
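The BUFFER_MUTATION outputs (index_copy through index_copy_3) come from the StaticCache update at cache_utils.py:1193-1194: the in-place k_out.index_copy_ / v_out.index_copy_ writes are functionalized by export into aten.index_copy plus a mutated-buffer output. A small standalone demonstration of that write pattern, with the cache shape from the trace:

import torch

max_cache_len, num_kv_heads, head_dim = 1234, 4, 4
k_out = torch.zeros(1, num_kv_heads, max_cache_len, head_dim, dtype=torch.bfloat16)

cache_position = torch.tensor([0])  # decode step 0
key_states = torch.randn(1, num_kv_heads, 1, head_dim, dtype=torch.bfloat16)

# Write the new key into slot 0 along the sequence dimension (dim=2).
k_out.index_copy_(2, cache_position, key_states)
assert torch.equal(k_out[:, :, :1, :], key_states)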