# finetune_reasoning.py

import math
import random
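
# Overview: everything below is a pure-Python (list-of-lists, no NumPy) forward
# pass through a three-stage pipeline: a transformer encoder compresses an input
# sequence into a fixed 1024-dim latent, a residual-MLP "latent manipulator"
# transforms that latent, and a transformer decoder expands the result into
# per-position vocabulary logits.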

def matmul(a, b):
    # (rows_a x cols_a) @ (rows_b x cols_b) -> (rows_a x cols_b); requires cols_a == rows_b.
    rows_a, cols_a = len(a), len(a[0])
    rows_b, cols_b = len(b), len(b[0])
    result = [[sum(a[i][k] * b[k][j] for k in range(cols_a)) for j in range(cols_b)] for i in range(rows_a)]
    return result
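
# e.g. matmul([[1, 2]], [[3], [4]]) == [[11]]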

def transpose(matrix):
    # Swap rows and columns (helper; unused in the pipeline below).
    return [[matrix[j][i] for j in range(len(matrix))] for i in range(len(matrix[0]))]

def softmax(x):
    # Subtract max(x) before exponentiating for numerical stability.
    exp_x = [math.exp(i - max(x)) for i in x]
    sum_exp = sum(exp_x)
    return [i / sum_exp for i in exp_x]
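
# e.g. softmax([0.0, 0.0]) == [0.5, 0.5]; outputs are positive and sum to 1.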

def layer_norm(x, eps=1e-5):
    # Normalize a vector to zero mean and unit variance (no learned scale/shift
    # in this simplified version).
    mean = sum(x) / len(x)
    variance = sum((i - mean) ** 2 for i in x) / len(x)
    return [(i - mean) / math.sqrt(variance + eps) for i in x]
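
# e.g. layer_norm([1.0, 3.0]) ~= [-1.0, 1.0] (mean 2, variance 1; eps only
# guards against division by zero).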

def gelu(x):
    # tanh approximation of the GELU activation.
    return 0.5 * x * (1 + math.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * x ** 3)))
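
# e.g. gelu(0.0) == 0.0; gelu(x) -> x for large positive x and -> 0 for large
# negative x, making it a smooth alternative to ReLU.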

def attention(q, k, v, d_k):
    # Scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.
    scores = [[sum(q[i][m] * k[j][m] for m in range(len(q[0]))) / math.sqrt(d_k) for j in range(len(k))] for i in range(len(q))]
    attn_weights = [softmax(row) for row in scores]
    output = [[sum(attn_weights[i][j] * v[j][m] for j in range(len(v))) for m in range(len(v[0]))] for i in range(len(attn_weights))]
    return output
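
# Note: no masking is applied, so attention is fully bidirectional; every
# position can attend to every other position.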

def multi_head_attention(x, num_heads, d_model, wq, wk, wv, wo):
    d_k = d_model // num_heads
    q = matmul(x, wq)
    k = matmul(x, wk)
    v = matmul(x, wv)

    # Slice the projections into num_heads pieces of width d_k each.
    seq_len = len(x)
    q_heads = [[[q[i][h * d_k + j] for j in range(d_k)] for i in range(seq_len)] for h in range(num_heads)]
    k_heads = [[[k[i][h * d_k + j] for j in range(d_k)] for i in range(seq_len)] for h in range(num_heads)]
    v_heads = [[[v[i][h * d_k + j] for j in range(d_k)] for i in range(seq_len)] for h in range(num_heads)]

    # Run attention independently per head.
    attn_outputs = [attention(q_heads[h], k_heads[h], v_heads[h], d_k) for h in range(num_heads)]

    # Concatenate the heads back along the feature dimension.
    concat = [[attn_outputs[h][i][j] for h in range(num_heads) for j in range(d_k)] for i in range(seq_len)]

    output = matmul(concat, wo)
    return output
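
# Each head attends over its own d_k = d_model // num_heads slice of the
# features, so d_model must be divisible by num_heads for every feature to be
# used (with the configuration below, 256 // 8 = 32 per head).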

def feed_forward(x, w1, b1, w2, b2):
    # Position-wise MLP: d_model -> d_ff with GELU, then back to d_model.
    hidden = [[gelu(sum(x[i][j] * w1[j][k] for j in range(len(x[0]))) + b1[k]) for k in range(len(w1[0]))] for i in range(len(x))]
    output = [[sum(hidden[i][j] * w2[j][k] for j in range(len(hidden[0]))) + b2[k] for k in range(len(w2[0]))] for i in range(len(hidden))]
    return output

def transformer_block(x, num_heads, d_model, d_ff, wq, wk, wv, wo, w1, b1, w2, b2):
    # Self-attention sublayer with residual connection, then LayerNorm.
    attn_out = multi_head_attention(x, num_heads, d_model, wq, wk, wv, wo)
    x = [[x[i][j] + attn_out[i][j] for j in range(len(x[0]))] for i in range(len(x))]
    x = [layer_norm(row) for row in x]

    # Feed-forward sublayer with residual connection, then LayerNorm.
    ff_out = feed_forward(x, w1, b1, w2, b2)
    x = [[x[i][j] + ff_out[i][j] for j in range(len(x[0]))] for i in range(len(x))]
    x = [layer_norm(row) for row in x]

    return x
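
# This uses the post-norm ("Add & Norm") ordering from the original Transformer:
# normalize after each residual addition rather than before the sublayer.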

def encoder(tokens, num_layers, num_heads, d_model, d_ff, params):
    x = tokens
    for layer in range(num_layers):
        x = transformer_block(x, num_heads, d_model, d_ff,
                            params[f'enc_l{layer}_wq'], params[f'enc_l{layer}_wk'],
                            params[f'enc_l{layer}_wv'], params[f'enc_l{layer}_wo'],
                            params[f'enc_l{layer}_w1'], params[f'enc_l{layer}_b1'],
                            params[f'enc_l{layer}_w2'], params[f'enc_l{layer}_b2'])

    # Mean-pool over the sequence, then project d_model -> 1024 (the latent
    # width hardcoded throughout this script).
    pooled = [sum(x[i][j] for i in range(len(x))) / len(x) for j in range(d_model)]
    latent = [sum(pooled[j] * params['enc_proj'][j][k] for j in range(len(pooled))) for k in range(1024)]

    return latent
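
# e.g. with d_model = 256 and a 5-token input, x stays 5 x 256 through every
# layer, pooled has length 256, and the returned latent has length 1024.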

def latent_manipulator(latent, depth, width, params):
    x = latent[:]

    for layer in range(depth):
        residual = x[:]

        # Two-layer MLP: 1024 -> width with GELU, then back to 1024.
        hidden = [gelu(sum(x[j] * params[f'lm_l{layer}_w1'][j][k] for j in range(len(x))) + params[f'lm_l{layer}_b1'][k]) for k in range(width)]
        x = [sum(hidden[j] * params[f'lm_l{layer}_w2'][j][k] for j in range(len(hidden))) + params[f'lm_l{layer}_b2'][k] for k in range(len(x))]

        # Residual connection and LayerNorm, as in the transformer blocks.
        x = [x[i] + residual[i] for i in range(len(x))]
        x = layer_norm(x)

    return x
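
# The "reasoning" step operates entirely on the 1024-dim latent vector, with no
# token-level computation: each block applies
# x = layer_norm(x + gelu(x @ W1 + b1) @ W2 + b2).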

def decoder(latent, max_len, num_layers, num_heads, d_model, d_ff, vocab_size, params):
    # Project the 1024-dim latent down to d_model and seed every one of the
    # max_len output positions with the same projected vector.
    x = [[sum(latent[j] * params['dec_proj'][j][k] for j in range(len(latent))) for k in range(d_model)] for _ in range(max_len)]

    for layer in range(num_layers):
        x = transformer_block(x, num_heads, d_model, d_ff,
                            params[f'dec_l{layer}_wq'], params[f'dec_l{layer}_wk'],
                            params[f'dec_l{layer}_wv'], params[f'dec_l{layer}_wo'],
                            params[f'dec_l{layer}_w1'], params[f'dec_l{layer}_b1'],
                            params[f'dec_l{layer}_w2'], params[f'dec_l{layer}_b2'])

    # Project each position from d_model up to vocabulary logits.
    logits = [[sum(x[i][j] * params['output_proj'][j][k] for j in range(d_model)) for k in range(vocab_size)] for i in range(len(x))]

    return logits
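
# Because every position is seeded with the same vector and there is no
# positional encoding or masking, all max_len rows of the logits come out
# identical in this simplified decoder.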

def init_params(num_enc_layers, num_dec_layers, num_heads, d_model, d_ff, vocab_size, lm_depth, lm_width):
    params = {}

    # Encoder blocks: attention projections plus feed-forward weights.
    for layer in range(num_enc_layers):
        params[f'enc_l{layer}_wq'] = [[random.gauss(0, 0.02) for _ in range(d_model)] for _ in range(d_model)]
        params[f'enc_l{layer}_wk'] = [[random.gauss(0, 0.02) for _ in range(d_model)] for _ in range(d_model)]
        params[f'enc_l{layer}_wv'] = [[random.gauss(0, 0.02) for _ in range(d_model)] for _ in range(d_model)]
        params[f'enc_l{layer}_wo'] = [[random.gauss(0, 0.02) for _ in range(d_model)] for _ in range(d_model)]
        params[f'enc_l{layer}_w1'] = [[random.gauss(0, 0.02) for _ in range(d_ff)] for _ in range(d_model)]
        params[f'enc_l{layer}_b1'] = [0.0] * d_ff
        params[f'enc_l{layer}_w2'] = [[random.gauss(0, 0.02) for _ in range(d_model)] for _ in range(d_ff)]
        params[f'enc_l{layer}_b2'] = [0.0] * d_model

    # d_model -> 1024 projection into the latent space.
    params['enc_proj'] = [[random.gauss(0, 0.02) for _ in range(1024)] for _ in range(d_model)]

    # Latent-manipulator MLP blocks (1024 -> lm_width -> 1024).
    for layer in range(lm_depth):
        params[f'lm_l{layer}_w1'] = [[random.gauss(0, 0.02) for _ in range(lm_width)] for _ in range(1024)]
        params[f'lm_l{layer}_b1'] = [0.0] * lm_width
        params[f'lm_l{layer}_w2'] = [[random.gauss(0, 0.02) for _ in range(1024)] for _ in range(lm_width)]
        params[f'lm_l{layer}_b2'] = [0.0] * 1024

    # 1024 -> d_model projection out of the latent space.
    params['dec_proj'] = [[random.gauss(0, 0.02) for _ in range(d_model)] for _ in range(1024)]

    # Decoder blocks mirror the encoder blocks.
    for layer in range(num_dec_layers):
        params[f'dec_l{layer}_wq'] = [[random.gauss(0, 0.02) for _ in range(d_model)] for _ in range(d_model)]
        params[f'dec_l{layer}_wk'] = [[random.gauss(0, 0.02) for _ in range(d_model)] for _ in range(d_model)]
        params[f'dec_l{layer}_wv'] = [[random.gauss(0, 0.02) for _ in range(d_model)] for _ in range(d_model)]
        params[f'dec_l{layer}_wo'] = [[random.gauss(0, 0.02) for _ in range(d_model)] for _ in range(d_model)]
        params[f'dec_l{layer}_w1'] = [[random.gauss(0, 0.02) for _ in range(d_ff)] for _ in range(d_model)]
        params[f'dec_l{layer}_b1'] = [0.0] * d_ff
        params[f'dec_l{layer}_w2'] = [[random.gauss(0, 0.02) for _ in range(d_model)] for _ in range(d_ff)]
        params[f'dec_l{layer}_b2'] = [0.0] * d_model

    # Final d_model -> vocab_size output projection.
    params['output_proj'] = [[random.gauss(0, 0.02) for _ in range(vocab_size)] for _ in range(d_model)]

    return params
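
# All weights are drawn from N(0, 0.02) (the same scheme GPT-2 uses) and all
# biases start at zero.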

def forward_pass(input_tokens, num_enc_layers, num_dec_layers, num_heads, d_model, d_ff, vocab_size, lm_depth, lm_width, max_output_len, params):
    question_latent = encoder(input_tokens, num_enc_layers, num_heads, d_model, d_ff, params)
    answer_latent = latent_manipulator(question_latent, lm_depth, lm_width, params)
    output_logits = decoder(answer_latent, max_output_len, num_dec_layers, num_heads, d_model, d_ff, vocab_size, params)
    return output_logits
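
# End-to-end shape contract: (seq_len x d_model) input -> 1024-dim latent ->
# (max_output_len x vocab_size) logits.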

num_enc_layers = 4
num_dec_layers = 4
num_heads = 8
d_model = 256
d_ff = 1024
vocab_size = 5000
lm_depth = 8
lm_width = 2048
max_output_len = 10
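
# With these settings each head gets d_k = 256 // 8 = 32 features, and the model
# holds roughly 42M parameters, about 34M of which sit in the latent
# manipulator.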

params = init_params(num_enc_layers, num_dec_layers, num_heads, d_model, d_ff, vocab_size, lm_depth, lm_width)

# A random 5-position input of pre-embedded d_model vectors (there is no token
# embedding table in this script).
sample_input = [[random.gauss(0, 1) for _ in range(d_model)] for _ in range(5)]

output = forward_pass(sample_input, num_enc_layers, num_dec_layers, num_heads, d_model, d_ff, vocab_size, lm_depth, lm_width, max_output_len, params)

print(f"Input shape: {len(sample_input)} x {len(sample_input[0])} (sequence_length x d_model)")
print(f"Output shape: {len(output)} x {len(output[0])} (max_output_len x vocab_size)")
print(f"Sample output logits for first token: {output[0][:10]}...")