Not a member of Pastebin yet? Sign up — it unlocks many cool features!
- import os
- import requests
- import json
- import re
- import signal
- import time
- import ebooklib
- from ebooklib import epub
- import markdownify
- word_count_limit = 20000
- # Define the URL
- generateUrl = "http://localhost:5001/api/v1/generate"
- tokenCountUrl = "http://localhost:5001/api/extra/tokencount"
- # Define the headers for the request
- headers = {
- 'Content-Type': 'application/json'
- }
- def signal_handler(sig, frame):
- global running
- if (running):
- running = False
- signal.signal(signal.SIGINT, signal_handler)
- #Using Command-R model
- def getPromptJson(prompt, memory = None):
- data = {
- "n": 1,
- "max_context_length": 65536,
- "rep_pen": 1.1,
- "temperature": 0.7,
- "top_p": 0.92,
- "top_k": 100,
- "top_a": 0,
- "typical": 1,
- "tfs": 1,
- "rep_pen_range": 320,
- "rep_pen_slope": 0.7,
- "sampler_order": [6, 0, 1, 3, 4, 2, 5],
- #"memory": "[Summary: The song celebrates the remarkable attributes and hardworking nature of ants, etc.]\n",
- "trim_stop": True,
- "min_p": 0,
- "dynatemp_range": 0,
- "dynatemp_exponent": 1,
- "smoothing_factor": 0,
- "banned_tokens": [],
- "render_special": False,
- #"xtc_threshold": 0.15,
- #"xtc_probability": 0.5,
- "presence_penalty": 0,
- "logit_bias": {},
- "quiet": True,
- "use_default_badwordsids": False,
- "bypass_eos": False,
- "stop_sequence": ["<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>", "<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"],
- "prompt": "<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>" + prompt + "<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
- "max_length": 1024,
- }
- if memory:
- data["memory"] = memory
- # if grammar:
- # data["grammar"] = grammar
- return json.dumps(data)
- def getResult(prompt):
- # Send the request and get the response
- try:
- response = requests.post(generateUrl, headers=headers, data=getPromptJson(prompt))
- except Exception as e:
- print(f"An error occurred: {e}")
- return False
- else: # Check if the request was successful
- if response.status_code == 200:
- # Parse the response JSON into a Python dictionary
- response_data = json.loads(response.text)
- #print(response_data)
- return response_data["results"][0]["text"]
- else:
- print(f"Request failed with status code {response.status_code}")
- return False
- #not used in this script
- def countTokens(prompt):
- response = requests.post(tokenCountUrl, headers=headers, data=json.dumps({"prompt":prompt}))
- if response.status_code == 200:
- # Parse the response JSON into a Python dictionary
- response_data = json.loads(response.text)
- print(response_data)
- def truncate_by_word_count(text, word_count_limit):
- """Truncates a text string by word count, preserving formatting.
- Args:
- text: The input text string.
- word_count_limit: The maximum number of words to include.
- Returns:
- The truncated text string.
- """
- truncated = False
- words_and_separators = re.split(r"(\s+)", text)
- truncated_words_and_separators = []
- word_count = 0
- for word_or_separator in words_and_separators:
- if word_or_separator.strip(): # Check if it's a word
- word_count += 1
- truncated_words_and_separators.append(word_or_separator)
- if word_count >= word_count_limit:
- truncated = True
- break
- truncated_text = ''.join(truncated_words_and_separators)
- return truncated_text, truncated
- def extract_epub_data(epub_file_path, word_count_limit):
- """Extracts text, title, and author from an EPUB file.
- Args:
- epub_file_path: The path to the EPUB file.
- word_count_limit: The maximum number of words to include in the extracted text.
- Returns:
- A tuple containing the extracted text, title, and author.
- """
- book = epub.read_epub(epub_file_path)
- # Extract title and author from metadata (might need adjustment depending on the library)
- title = book.get_metadata("DC", 'title') # Adjust based on your library's method
- author = book.get_metadata("DC", 'creator') # Adjust based on your library's method
- # Extract text content
- text = ''
- items = list(book.get_items_of_type(ebooklib.ITEM_DOCUMENT))
- for item in items[1:]: # Skip the first item
- text += markdownify.markdownify(item.get_body_content().decode("utf-8"), escape_misc=False) + '\n\n'
- extra_breaks = r"\n{3,}" # Matches three or more consecutive newlines
- two_breaks = "\n\n" # Replace with two newlines
- text = re.sub(extra_breaks, two_breaks, text)
- truncate_result = truncate_by_word_count(text, word_count_limit)
- return {"text":truncate_result[0], "title":title[0][0], "author":author[0][0], "truncated":truncate_result[1]}
- def review_files(directory):
- reviews = None
- try:
- with open("reviews.json", 'r') as f:
- reviews = json.load(f)
- except:
- reviews = {}
- for root, dirs, files in os.walk(directory):
- for file in files:
- if not running:
- print("Interrupted.")
- with open("reviews.json", "w") as writefile:
- json.dump(reviews, writefile, indent=4, sort_keys=True)
- return
- if file.endswith(".epub") and not (file in reviews):
- start_time = time.time()
- epub_file_path = os.path.join(root, file)
- result = extract_epub_data(epub_file_path, word_count_limit)
- if result["truncated"]:
- prompt = f'The following is an excerpt from a work of fiction titled {result["title"]}.\n\n{result["text"]}\n\n### End of story excerpt.'
- else:
- prompt = f'The following is a work of fiction titled {result["title"]}.\n\n{result["text"]}\n\n### End of story.'
- prompt = prompt + (
- "\n\nI am trying to identify stories that feature"
- #TODO: Insert a description of the kinds of stories you're interested in finding. Feel free
- #to be wordy here, we're already feeding thousands of words of story into the AI so a few extra lines
- #here shouldn't burden the AI much more than that.
- " Please review this story and tell me whether it has any content that matches those criteria,"
- " giving a brief summary of it if there is any.")
- print(f'Reviewing {result["title"]} by {result["author"]}\n')
- review = getResult(prompt)
- reviews[file] = {"title":result["title"], "author": result["author"], "review":review}
- print(f'{review}\n')
- with open("reviews.json", "w") as writefile:
- json.dump(reviews, writefile, indent=4, sort_keys=True)
- end_time = time.time()
- elapsed_time = end_time - start_time
- minutes = int(elapsed_time // 60)
- seconds = int(elapsed_time % 60)
- print(f"Review took {minutes} minutes and {seconds} seconds.\n----\n")
- running = True
- review_files('.')
Advertisement
Add Comment
Please sign in to add a comment.