# finetune_reasoning.py
# Toy encoder -> latent-manipulator -> decoder transformer, implemented with plain
# Python lists (no external dependencies).
import math
import random
def matmul(a, b):
    """Multiply two matrices represented as nested lists."""
    rows_a, cols_a = len(a), len(a[0])
    rows_b, cols_b = len(b), len(b[0])
    assert cols_a == rows_b, "inner dimensions must match"
    return [[sum(a[i][k] * b[k][j] for k in range(cols_a)) for j in range(cols_b)]
            for i in range(rows_a)]

def transpose(matrix):
    """Swap the rows and columns of a nested-list matrix."""
    return [[matrix[j][i] for j in range(len(matrix))] for i in range(len(matrix[0]))]
def softmax(x):
    """Numerically stable softmax: subtract the max before exponentiating."""
    exp_x = [math.exp(i - max(x)) for i in x]
    sum_exp = sum(exp_x)
    return [i / sum_exp for i in exp_x]
def layer_norm(x, eps=1e-5):
    """Normalize a vector to zero mean and unit variance."""
    mean = sum(x) / len(x)
    variance = sum((i - mean) ** 2 for i in x) / len(x)
    return [(i - mean) / math.sqrt(variance + eps) for i in x]
def gelu(x):
    """GELU activation using the tanh approximation."""
    return 0.5 * x * (1 + math.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * x ** 3)))
def attention(q, k, v, d_k):
    """Scaled dot-product attention for a single head."""
    scores = [[sum(q[i][m] * k[j][m] for m in range(len(q[0]))) / math.sqrt(d_k)
               for j in range(len(k))] for i in range(len(q))]
    attn_weights = [softmax(row) for row in scores]
    output = [[sum(attn_weights[i][j] * v[j][m] for j in range(len(v))) for m in range(len(v[0]))]
              for i in range(len(attn_weights))]
    return output
def multi_head_attention(x, num_heads, d_model, wq, wk, wv, wo):
    """Project to q/k/v, split into heads, attend per head, concatenate, and project out."""
    d_k = d_model // num_heads
    q = matmul(x, wq)
    k = matmul(x, wk)
    v = matmul(x, wv)
    seq_len = len(x)
    # Slice each projection into num_heads chunks of width d_k.
    q_heads = [[[q[i][h * d_k + j] for j in range(d_k)] for i in range(seq_len)] for h in range(num_heads)]
    k_heads = [[[k[i][h * d_k + j] for j in range(d_k)] for i in range(seq_len)] for h in range(num_heads)]
    v_heads = [[[v[i][h * d_k + j] for j in range(d_k)] for i in range(seq_len)] for h in range(num_heads)]
    attn_outputs = [attention(q_heads[h], k_heads[h], v_heads[h], d_k) for h in range(num_heads)]
    # Concatenate the per-head outputs back to width d_model, then apply the output projection.
    concat = [[attn_outputs[h][i][j] for h in range(num_heads) for j in range(d_k)] for i in range(seq_len)]
    output = matmul(concat, wo)
    return output
def feed_forward(x, w1, b1, w2, b2):
    """Position-wise feed-forward network: gelu(x @ w1 + b1) @ w2 + b2."""
    hidden = [[gelu(sum(x[i][j] * w1[j][k] for j in range(len(x[0]))) + b1[k])
               for k in range(len(w1[0]))] for i in range(len(x))]
    output = [[sum(hidden[i][j] * w2[j][k] for j in range(len(hidden[0]))) + b2[k]
               for k in range(len(w2[0]))] for i in range(len(hidden))]
    return output
def transformer_block(x, num_heads, d_model, d_ff, wq, wk, wv, wo, w1, b1, w2, b2):
    """Post-norm transformer block: attention + residual + norm, then FFN + residual + norm."""
    attn_out = multi_head_attention(x, num_heads, d_model, wq, wk, wv, wo)
    x = [[x[i][j] + attn_out[i][j] for j in range(len(x[0]))] for i in range(len(x))]
    x = [layer_norm(row) for row in x]
    ff_out = feed_forward(x, w1, b1, w2, b2)
    x = [[x[i][j] + ff_out[i][j] for j in range(len(x[0]))] for i in range(len(x))]
    x = [layer_norm(row) for row in x]
    return x
def encoder(tokens, num_layers, num_heads, d_model, d_ff, params):
    """Run embedded tokens through the encoder stack and pool them into a 1024-dim latent."""
    x = tokens
    for layer in range(num_layers):
        x = transformer_block(x, num_heads, d_model, d_ff,
                              params[f'enc_l{layer}_wq'], params[f'enc_l{layer}_wk'],
                              params[f'enc_l{layer}_wv'], params[f'enc_l{layer}_wo'],
                              params[f'enc_l{layer}_w1'], params[f'enc_l{layer}_b1'],
                              params[f'enc_l{layer}_w2'], params[f'enc_l{layer}_b2'])
    # Mean-pool over the sequence, then project d_model -> 1024.
    pooled = [sum(x[i][j] for i in range(len(x))) / len(x) for j in range(d_model)]
    latent = [sum(pooled[j] * params['enc_proj'][j][k] for j in range(len(pooled))) for k in range(1024)]
    return latent
def latent_manipulator(latent, depth, width, params):
    """Residual MLP stack that maps the question latent to an answer latent."""
    x = latent[:]
    for layer in range(depth):
        residual = x[:]
        hidden = [gelu(sum(x[j] * params[f'lm_l{layer}_w1'][j][k] for j in range(len(x))) + params[f'lm_l{layer}_b1'][k])
                  for k in range(width)]
        x = [sum(hidden[j] * params[f'lm_l{layer}_w2'][j][k] for j in range(len(hidden))) + params[f'lm_l{layer}_b2'][k]
             for k in range(len(x))]
        x = [x[i] + residual[i] for i in range(len(x))]
        x = layer_norm(x)
    return x
def decoder(latent, max_len, num_layers, num_heads, d_model, d_ff, vocab_size, params):
    """Project the latent back to d_model, broadcast it over max_len positions, decode, and emit vocab logits."""
    projected = [sum(latent[j] * params['dec_proj'][j][k] for j in range(len(latent))) for k in range(d_model)]
    x = [projected[:] for _ in range(max_len)]
    for layer in range(num_layers):
        x = transformer_block(x, num_heads, d_model, d_ff,
                              params[f'dec_l{layer}_wq'], params[f'dec_l{layer}_wk'],
                              params[f'dec_l{layer}_wv'], params[f'dec_l{layer}_wo'],
                              params[f'dec_l{layer}_w1'], params[f'dec_l{layer}_b1'],
                              params[f'dec_l{layer}_w2'], params[f'dec_l{layer}_b2'])
    logits = [[sum(x[i][j] * params['output_proj'][j][k] for j in range(d_model)) for k in range(vocab_size)]
              for i in range(len(x))]
    return logits
def init_params(num_enc_layers, num_dec_layers, num_heads, d_model, d_ff, vocab_size, lm_depth, lm_width):
    params = {}
    for layer in range(num_enc_layers):
        params[f'enc_l{layer}_wq'] = [[random.gauss(0, 0.02) for _ in range(d_model)] for _ in range(d_model)]
        params[f'enc_l{layer}_wk'] = [[random.gauss(0, 0.02) for _ in range(d_model)] for _ in range(d_model)]
        params[f'enc_l{layer}_wv'] = [[random.gauss(0, 0.02) for _ in range(d_model)] for _ in range(d_model)]
        params[f'enc_l{layer}_wo'] = [[random.gauss(0, 0.02) for _ in range(d_model)] for _ in range(d_model)]
        params[f'enc_l{layer}_w1'] = [[random.gauss(0, 0.02) for _ in range(d_ff)] for _ in range(d_model)]
        params[f'enc_l{layer}_b1'] = [0.0] * d_ff
        params[f'enc_l{layer}_w2'] = [[random.gauss(0, 0.02) for _ in range(d_model)] for _ in range(d_ff)]
        params[f'enc_l{layer}_b2'] = [0.0] * d_model
    params['enc_proj'] = [[random.gauss(0, 0.02) for _ in range(1024)] for _ in range(d_model)]
    for layer in range(lm_depth):
        params[f'lm_l{layer}_w1'] = [[random.gauss(0, 0.02) for _ in range(lm_width)] for _ in range(1024)]
        params[f'lm_l{layer}_b1'] = [0.0] * lm_width
        params[f'lm_l{layer}_w2'] = [[random.gauss(0, 0.02) for _ in range(1024)] for _ in range(lm_width)]
        params[f'lm_l{layer}_b2'] = [0.0] * 1024
    params['dec_proj'] = [[random.gauss(0, 0.02) for _ in range(d_model)] for _ in range(1024)]
    for layer in range(num_dec_layers):
        params[f'dec_l{layer}_wq'] = [[random.gauss(0, 0.02) for _ in range(d_model)] for _ in range(d_model)]
        params[f'dec_l{layer}_wk'] = [[random.gauss(0, 0.02) for _ in range(d_model)] for _ in range(d_model)]
        params[f'dec_l{layer}_wv'] = [[random.gauss(0, 0.02) for _ in range(d_model)] for _ in range(d_model)]
        params[f'dec_l{layer}_wo'] = [[random.gauss(0, 0.02) for _ in range(d_model)] for _ in range(d_model)]
        params[f'dec_l{layer}_w1'] = [[random.gauss(0, 0.02) for _ in range(d_ff)] for _ in range(d_model)]
        params[f'dec_l{layer}_b1'] = [0.0] * d_ff
        params[f'dec_l{layer}_w2'] = [[random.gauss(0, 0.02) for _ in range(d_model)] for _ in range(d_ff)]
        params[f'dec_l{layer}_b2'] = [0.0] * d_model
    params['output_proj'] = [[random.gauss(0, 0.02) for _ in range(vocab_size)] for _ in range(d_model)]
    return params
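
# Hypothetical helper, not part of the original paste: a rough parameter count for the
# dictionary built by init_params, handy as a sanity check on model size.
def count_params(params):
    """Count scalar entries across all weight matrices and bias vectors."""
    total = 0
    for value in params.values():
        if value and isinstance(value[0], list):
            total += len(value) * len(value[0])  # matrix: rows * cols
        else:
            total += len(value)  # bias vector
    return total
# Example usage once params is built below: print(f"parameters: {count_params(params):,}")
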
def forward_pass(input_tokens, num_enc_layers, num_dec_layers, num_heads, d_model, d_ff, vocab_size, lm_depth, lm_width, max_output_len, params):
    """Full pipeline: encode the question, manipulate the latent, decode the answer logits."""
    question_latent = encoder(input_tokens, num_enc_layers, num_heads, d_model, d_ff, params)
    answer_latent = latent_manipulator(question_latent, lm_depth, lm_width, params)
    output_logits = decoder(answer_latent, max_output_len, num_dec_layers, num_heads, d_model, d_ff, vocab_size, params)
    return output_logits
# Hyperparameters (kept small so the pure-Python demo stays tractable).
num_enc_layers = 4
num_dec_layers = 4
num_heads = 8
d_model = 256
d_ff = 1024
vocab_size = 5000
lm_depth = 8
lm_width = 2048
max_output_len = 10
params = init_params(num_enc_layers, num_dec_layers, num_heads, d_model, d_ff, vocab_size, lm_depth, lm_width)
# Demo: a sequence of 5 random "embedded tokens" of width d_model.
sample_input = [[random.gauss(0, 1) for _ in range(d_model)] for _ in range(5)]
output = forward_pass(sample_input, num_enc_layers, num_dec_layers, num_heads, d_model, d_ff, vocab_size, lm_depth, lm_width, max_output_len, params)
print("Input shape: sequence_length x d_model")
print(f"Output shape: {len(output)} x {len(output[0])} (max_output_len x vocab_size)")
print(f"Sample output logits for first token: {output[0][:10]}...")