Not a member of Pastebin yet?
Sign up — it unlocks many cool features!
import numpy as np
import torch
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
- loaded_dataset = load_dataset("squad_v2")
- dataset = [item['question'] for item in loaded_dataset['train']]
def find_similar_texts(query, dataset, model, top_n=3, dataset_embeddings=None):
    """Return the ``top_n`` entries of *dataset* most similar to *query*.

    Similarity is cosine similarity between embeddings produced by
    ``model.encode``.

    Parameters
    ----------
    query : str
        Text to search for.
    dataset : list[str]
        Candidate texts to rank.
    model : object
        Embedding model exposing ``encode(list[str]) -> array-like``
        (e.g. a SentenceTransformer).
    top_n : int, optional
        Number of results to return (default 3).
    dataset_embeddings : array-like, optional
        Precomputed embeddings for *dataset*.  Encoding the corpus is the
        expensive step; callers issuing many queries should compute it once
        and pass it here instead of re-encoding on every call.

    Returns
    -------
    list[tuple[str, float]]
        ``(text, score)`` pairs, most similar first.
    """
    # Guard the empty corpus: downstream matrix ops would raise on a
    # zero-row embedding array.
    if not dataset:
        return []
    if dataset_embeddings is None:
        dataset_embeddings = model.encode(dataset)

    corpus = np.asarray(dataset_embeddings, dtype=float)
    query_vec = np.asarray(model.encode([query]), dtype=float).ravel()

    # Cosine similarity computed directly (dot / norms); zero-norm vectors
    # get norm 1 so their similarity is 0, matching sklearn's convention.
    query_norm = np.linalg.norm(query_vec) or 1.0
    corpus_norms = np.linalg.norm(corpus, axis=1)
    corpus_norms[corpus_norms == 0.0] = 1.0
    similarities = (corpus @ query_vec) / (corpus_norms * query_norm)

    # Indices of the top_n highest scores, best first.
    top_indices = similarities.argsort()[-top_n:][::-1]
    return [(dataset[i], float(similarities[i])) for i in top_indices]
def enhance_prompt_with_retrieved_text(query, dataset, model, top_n=3):
    """Build a retrieval-augmented prompt for *query*.

    Looks up the ``top_n`` dataset entries closest to the query and
    prepends them as context, then asks the model to answer.
    """
    hits = find_similar_texts(query, dataset, model, top_n)
    context_lines = []
    for rank, (text, _score) in enumerate(hits, start=1):
        context_lines.append(f"Relevant code {rank}: {text}")
    retrieved_texts = "\n".join(context_lines)
    return (
        f"User query: {query}\n\n"
        f"Relevant to user query chunks:\n{retrieved_texts}\n\n"
        f"Answer for user query, please"
    )
if __name__ == "__main__":
    # Generator: a small BLOOM checkpoint wrapped in a text-generation pipeline.
    model_name = "bigscience/bloom-560m"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

    # Embedding model used for retrieval over the question corpus.
    embed_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

    user_query = "how to make money on cryptocurrency?"
    prompt_with_context = enhance_prompt_with_retrieved_text(
        user_query, dataset, embed_model
    )
    print(prompt_with_context)

    # Sampled generation; max_length bounds prompt + completion tokens.
    completions = generator(
        prompt_with_context,
        max_length=2048,
        num_return_sequences=1,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.2,
    )
    print(completions[0]['generated_text'])
Advertisement
Add Comment
Please, Sign In to add comment