import openai
import transformers
import math
import random

openai.api_key = '<your_api_key>'

# enwik8 is the first 10^8 bytes of an English Wikipedia XML dump;
# you can get it from http://mattmahoney.net/dc/enwik8.zip
enwikxmlfile = '/path/to/enwik8.xml'
with open(enwikxmlfile, 'r') as f:
    enwikxml = f.read()

tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2')
enwikxmltoks = tokenizer.encode(enwikxml)
def build_markov(tokens, depth):
    # Build an order-`depth` Markov model as a nested dict of counts:
    # counts[ctx_1][ctx_2]...[ctx_depth][token] -> occurrences.
    counts = {}
    path = [None for i in range(depth + 1)]
    for token in tokens:
        curr = counts
        path = (path + [token])[1:]  # sliding window of the last depth+1 items
        if None in path:
            continue  # not enough context yet
        for x in path[:-1]:
            if x not in curr:
                curr[x] = {}
            curr = curr[x]
        if token not in curr:
            curr[token] = 0
        curr[token] += 1
    return counts
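# Quick sanity check with a toy sequence (illustrative, not part of the
# experiment): a depth-1 model over [1, 2, 1, 2, 1] should see the bigram
# 1 -> 2 twice and 2 -> 1 twice.
assert build_markov([1, 2, 1, 2, 1], 1) == {1: {2: 2}, 2: {1: 2}}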
def compute_markov_logprobs(model, depth, seq, last_n):
    logprobs = []
    path = seq[:depth + 1]
    for token in seq[depth + 1:]:
        path = (path + [token])[1:]
        curr = model
        for x in path[:-1]:
            curr = curr[x]
        # Crude Laplace-style smoothing: add 1 to the denominator so no
        # probability is exactly 1. A context or token the model never saw
        # still raises a KeyError, which record_trial() below catches.
        tot_count = sum(curr.values()) + 1
        logprobs.append(math.log(curr[token] / tot_count))
    return logprobs[-last_n:]
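# Another toy check (illustrative): with the depth-1 counts above, P(2 | 1)
# is estimated as count(1 -> 2) / (total count after 1 + 1) = 2 / 3, so each
# scored step of [1, 2, 1, 2] contributes log(2/3) nats.
_toy = build_markov([1, 2, 1, 2, 1], 1)
assert all(abs(lp - math.log(2 / 3)) < 1e-9
           for lp in compute_markov_logprobs(_toy, 1, [1, 2, 1, 2], 2))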
def compute_oa_logprobs(model_name, tokens, last_n):
    # Legacy Completions API: max_tokens=0 with echo=True generates nothing
    # and just returns per-token logprobs for the prompt itself. The first
    # echoed token comes back with a logprob of None, but slicing off all
    # but the last last_n entries keeps it out of the result.
    completion = openai.Completion.create(
        model=model_name,
        logprobs=1,
        max_tokens=0,
        echo=True,
        prompt=tokenizer.decode(tokens)
    )
    return completion.choices[0].logprobs.token_logprobs[-last_n:]
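# Example call (commented out: it needs a real API key and costs money):
# compute_oa_logprobs('ada', enwikxmltoks[:2048], 1024)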
# Character-level and token-level Markov models at depths 0 through 3.
markov_chars_0 = build_markov(enwikxml, 0)
markov_chars_1 = build_markov(enwikxml, 1)
markov_chars_2 = build_markov(enwikxml, 2)
markov_chars_3 = build_markov(enwikxml, 3)
markov_toks_0 = build_markov(enwikxmltoks, 0)
markov_toks_1 = build_markov(enwikxmltoks, 1)
markov_toks_2 = build_markov(enwikxmltoks, 2)
markov_toks_3 = build_markov(enwikxmltoks, 3)
def record_trial(bits_per_char, model_name, get_tot_logprob, txt_len):
    try:
        tot_logprob = get_tot_logprob()
        # Convert a total logprob in nats into bits per character.
        bit_cost_per_char = -tot_logprob / math.log(2) / txt_len
    except Exception as e:
        print(e)
        bit_cost_per_char = None  # mark the trial as failed
    if model_name not in bits_per_char:
        bits_per_char[model_name] = []
    bits_per_char[model_name].append(bit_cost_per_char)
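# Worked example of the conversion above (illustrative numbers, not a real
# trial): a total logprob of -1420.0 nats over 1024 characters comes out to
# 1420.0 / ln(2) / 1024, i.e. roughly 2.0 bits per char.
assert abs(-(-1420.0) / math.log(2) / 1024 - 2.0) < 0.01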
oa_model_names = [
    'ada',
    'babbage',
    'curie',
    'davinci',
    'text-ada-001',
    'text-babbage-001',
    'text-curie-001',
    'text-davinci-002',
    'text-davinci-003',
]
# bits-per-char results for each model, one entry per trial
bits_per_char = {}
samples = []
for i in range(3):
    # Pick a random 2048-token window; score only its last 1024 tokens so
    # every model has up to 1024 tokens of preceding context. txt_len is the
    # character length of that scored suffix, used to put every model on a
    # common bits-per-char scale.
    sample_length = 2048
    sample_offset = int(random.random() * (len(enwikxmltoks) - sample_length))
    sample_toks = enwikxmltoks[sample_offset:sample_offset + sample_length]
    sample_text = tokenizer.decode(sample_toks)
    txt_len = len(tokenizer.decode(sample_toks[-1024:]))
    samples.append(sample_text)
    record_trial(bits_per_char, 'markov_chars_0', lambda: sum(compute_markov_logprobs(markov_chars_0, 0, list(sample_text), txt_len)), txt_len)
    record_trial(bits_per_char, 'markov_chars_1', lambda: sum(compute_markov_logprobs(markov_chars_1, 1, list(sample_text), txt_len)), txt_len)
    record_trial(bits_per_char, 'markov_chars_2', lambda: sum(compute_markov_logprobs(markov_chars_2, 2, list(sample_text), txt_len)), txt_len)
    record_trial(bits_per_char, 'markov_chars_3', lambda: sum(compute_markov_logprobs(markov_chars_3, 3, list(sample_text), txt_len)), txt_len)
    record_trial(bits_per_char, 'markov_toks_0', lambda: sum(compute_markov_logprobs(markov_toks_0, 0, sample_toks, 1024)), txt_len)
    record_trial(bits_per_char, 'markov_toks_1', lambda: sum(compute_markov_logprobs(markov_toks_1, 1, sample_toks, 1024)), txt_len)
    record_trial(bits_per_char, 'markov_toks_2', lambda: sum(compute_markov_logprobs(markov_toks_2, 2, sample_toks, 1024)), txt_len)
    record_trial(bits_per_char, 'markov_toks_3', lambda: sum(compute_markov_logprobs(markov_toks_3, 3, sample_toks, 1024)), txt_len)
    for model_name in oa_model_names:
        record_trial(bits_per_char, 'openai:' + model_name, lambda: sum(compute_oa_logprobs(model_name, sample_toks, 1024)), txt_len)
print(f' {"Model Name:":24s} Bits per char (stddev)')
for model_name, results in bits_per_char.items():
    results = [r for r in results if r is not None]  # drop failed trials
    if not results:
        continue
    mean = sum(results) / len(results)
    variance = sum([(x - mean) ** 2 for x in results])
    stddev = math.sqrt(variance / len(results))
    print(f' {model_name + ":":24s} {mean:.2f} ({stddev:.2f})')