FaceDeer

Having an AI read books so I don't have to

Oct 18th, 2024 (edited)
Python | 7.40 KB | Source Code
import os
import requests
import json
import re
import signal
import time

import ebooklib
from ebooklib import epub
import markdownify

# Maximum number of words from each book to send to the model
word_count_limit = 20000

# Define the API endpoint URLs (KoboldCpp's default local server)
generateUrl = "http://localhost:5001/api/v1/generate"
tokenCountUrl = "http://localhost:5001/api/extra/tokencount"

# Define the headers for the request
headers = {
    'Content-Type': 'application/json'
}

# Handle Ctrl+C gracefully: flip the running flag so the main loop can save
# its progress and exit cleanly instead of dying mid-review.
def signal_handler(sig, frame):
    global running
    if running:
        running = False
signal.signal(signal.SIGINT, signal_handler)

# Using a Command-R model; the prompt is wrapped in its chat-turn special tokens.
def getPromptJson(prompt, memory=None):
    data = {
        "n": 1,
        "max_context_length": 65536,
        "rep_pen": 1.1,
        "temperature": 0.7,
        "top_p": 0.92,
        "top_k": 100,
        "top_a": 0,
        "typical": 1,
        "tfs": 1,
        "rep_pen_range": 320,
        "rep_pen_slope": 0.7,
        "sampler_order": [6, 0, 1, 3, 4, 2, 5],
        #"memory": "[Summary: The song celebrates the remarkable attributes and hardworking nature of ants, etc.]\n",
        "trim_stop": True,
        "min_p": 0,
        "dynatemp_range": 0,
        "dynatemp_exponent": 1,
        "smoothing_factor": 0,
        "banned_tokens": [],
        "render_special": False,
        #"xtc_threshold": 0.15,
        #"xtc_probability": 0.5,
        "presence_penalty": 0,
        "logit_bias": {},
        "quiet": True,
        "use_default_badwordsids": False,
        "bypass_eos": False,
        "stop_sequence": ["<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>", "<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"],
        "prompt": "<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>" + prompt + "<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
        "max_length": 1024,
    }
    if memory:
        data["memory"] = memory
#    if grammar:
#        data["grammar"] = grammar
    return json.dumps(data)

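# Illustrative example (not part of the script's flow): calling
#   getPromptJson("Summarize this story.")
# yields JSON whose "prompt" field is the text wrapped in Command-R turn tokens:
#   <|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Summarize this story.<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
# The stop_sequence entries halt generation as soon as the model tries to
# start a new conversational turn.
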
def getResult(prompt):
    # Send the request and get the response
    try:
        response = requests.post(generateUrl, headers=headers, data=getPromptJson(prompt))
    except Exception as e:
        print(f"An error occurred: {e}")
        return False
    else:    # Check if the request was successful
        if response.status_code == 200:
            # Parse the response JSON into a Python dictionary
            response_data = json.loads(response.text)
            #print(response_data)
            return response_data["results"][0]["text"]
        else:
            print(f"Request failed with status code {response.status_code}")
            return False

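# For reference: a successful KoboldCpp response body has the shape
#   {"results": [{"text": "...generated text..."}]}
# which is why getResult() reads results[0]["text"] above.
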
# Not used in this script; handy for checking how many tokens a prompt occupies.
def countTokens(prompt):
    response = requests.post(tokenCountUrl, headers=headers, data=json.dumps({"prompt": prompt}))
    if response.status_code == 200:
        # Parse the response JSON into a Python dictionary
        response_data = json.loads(response.text)
        print(response_data)

def truncate_by_word_count(text, word_count_limit):
    """Truncates a text string by word count, preserving formatting.

    Args:
        text: The input text string.
        word_count_limit: The maximum number of words to include.

    Returns:
        A tuple of (truncated text, whether truncation occurred).
    """

    truncated = False
    words_and_separators = re.split(r"(\s+)", text)
    truncated_words_and_separators = []
    word_count = 0
    for word_or_separator in words_and_separators:
        if word_or_separator.strip():  # Check if it's a word
            word_count += 1
        truncated_words_and_separators.append(word_or_separator)
        if word_count >= word_count_limit:
            truncated = True
            break

    truncated_text = ''.join(truncated_words_and_separators)
    return truncated_text, truncated

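# Worked example (illustrative):
#   truncate_by_word_count("one two  three four", 3)
# returns ("one two  three", True). Splitting on r"(\s+)" keeps the separator
# runs, so spacing and line breaks in the kept portion survive unchanged.
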
def extract_epub_data(epub_file_path, word_count_limit):
    """Extracts text, title, and author from an EPUB file.

    Args:
        epub_file_path: The path to the EPUB file.
        word_count_limit: The maximum number of words to include in the extracted text.

    Returns:
        A dict with "text", "title", "author", and "truncated" keys.
    """

    book = epub.read_epub(epub_file_path)

    # Extract title and author from the Dublin Core metadata;
    # get_metadata returns a list of (value, attributes) tuples
    title = book.get_metadata("DC", 'title')
    author = book.get_metadata("DC", 'creator')

    # Extract text content, converting each chapter's HTML to Markdown
    text = ''
    items = list(book.get_items_of_type(ebooklib.ITEM_DOCUMENT))
    for item in items[1:]:  # Skip the first item (often a cover or title page)
        text += markdownify.markdownify(item.get_body_content().decode("utf-8"), escape_misc=False) + '\n\n'
    extra_breaks = r"\n{3,}"  # Matches three or more consecutive newlines
    two_breaks = "\n\n"  # Replace with two newlines
    text = re.sub(extra_breaks, two_breaks, text)
    truncate_result = truncate_by_word_count(text, word_count_limit)

    return {"text": truncate_result[0], "title": title[0][0], "author": author[0][0], "truncated": truncate_result[1]}

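# Illustrative return value for a hypothetical book:
#   {"text": "CHAPTER 1\n\nIt was a dark...", "title": "Some Title",
#    "author": "Some Author", "truncated": True}
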
def review_files(directory):
    # Load any existing reviews so an interrupted run can resume where it left off.
    try:
        with open("reviews.json", 'r') as f:
            reviews = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        reviews = {}

    for root, dirs, files in os.walk(directory):
        for file in files:
            if not running:
                print("Interrupted.")
                with open("reviews.json", "w") as writefile:
                    json.dump(reviews, writefile, indent=4, sort_keys=True)
                return
            if file.endswith(".epub") and file not in reviews:
                start_time = time.time()
                epub_file_path = os.path.join(root, file)
                result = extract_epub_data(epub_file_path, word_count_limit)
                if result["truncated"]:
                    prompt = f'The following is an excerpt from a work of fiction titled {result["title"]}.\n\n{result["text"]}\n\n### End of story excerpt.'
                else:
                    prompt = f'The following is a work of fiction titled {result["title"]}.\n\n{result["text"]}\n\n### End of story.'

                prompt = prompt + (
                    "\n\nI am trying to identify stories that feature"
                    # TODO: Insert a description of the kinds of stories you're interested in finding.
                    # Feel free to be wordy here; we're already feeding thousands of words of story
                    # into the AI, so a few extra lines shouldn't burden it much more than that.
                    " Please review this story and tell me whether it has any content that matches those criteria,"
                    " giving a brief summary of it if there is any.")

                print(f'Reviewing {result["title"]} by {result["author"]}\n')
                review = getResult(prompt)
                reviews[file] = {"title": result["title"], "author": result["author"], "review": review}
                print(f'{review}\n')
                # Save after every book so progress isn't lost if the run is interrupted.
                with open("reviews.json", "w") as writefile:
                    json.dump(reviews, writefile, indent=4, sort_keys=True)

                end_time = time.time()
                elapsed_time = end_time - start_time
                minutes = int(elapsed_time // 60)
                seconds = int(elapsed_time % 60)
                print(f"Review took {minutes} minutes and {seconds} seconds.\n----\n")

running = True
review_files('.')
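
# To run, assuming a KoboldCpp server on localhost:5001 with a Command-R model
# loaded and your .epub files somewhere under the current directory:
#   pip install requests ebooklib markdownify
#   python review_books.py   # "review_books.py" is a hypothetical filename for this script
# Reviews accumulate in reviews.json; Ctrl+C saves progress and exits, and a
# re-run skips any book already recorded there.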
Tags: ai Ebooks