xosski

LLM Word Processing

Jun 2nd, 2025
import json
import re
import contractions
from pathlib import Path
from docx import Document
from spellchecker import SpellChecker
from sentence_transformers import SentenceTransformer

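# Setup note (mine, not part of the original paste): the imports above map to
# these PyPI packages:
#   pip install python-docx pyspellchecker contractions sentence-transformers
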
# === Core Text I/O ===
def read_file(file_path):
    """Return the plain text of a .txt, .md, or .docx file."""
    ext = Path(file_path).suffix.lower()
    if ext in ('.txt', '.md'):
        return Path(file_path).read_text(encoding='utf-8')
    elif ext == '.docx':
        doc = Document(file_path)
        return "\n".join(p.text for p in doc.paragraphs)
    else:
        raise ValueError(f"Unsupported format: {ext}")

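# Usage sketch (the filename is illustrative, not from the paste):
#   text = read_file("notes.docx")
# Caveat: python-docx's doc.paragraphs covers body paragraphs only, so text
# inside tables, headers, and footers is silently skipped.
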
# === Chunking ===
def chunk_text(text, max_words=500, overlap=50):
    """Split text into word windows that overlap by `overlap` words."""
    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        end = min(start + max_words, len(words))
        chunks.append(" ".join(words[start:end]))
        if end == len(words):
            break  # done; stepping back by `overlap` here would loop forever
        start = end - overlap
    return chunks

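# Worked example of the windowing (a sketch, not from the original paste):
# a 1,100-word text with the defaults yields word spans [0:500], [450:950],
# and [900:1100], i.e. each chunk restarts 50 words before the previous end.
#   chunks = chunk_text(" ".join(["w"] * 1100))
#   assert [len(c.split()) for c in chunks] == [500, 500, 200]
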
# === Spell Correction ===
def correct_spelling(text):
    spell = SpellChecker()
    words = text.split()
    corrected = [spell.correction(word) or word for word in words]
    return ' '.join(corrected)

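# Two caveats worth knowing (my notes, based on pyspellchecker's documented
# behavior): correction() returns None for words it cannot resolve, which is
# why the `or word` fallback keeps the original token, and checking every word
# one at a time is slow on long documents.
#   SpellChecker().correction("speling")  # -> "spelling" (typically)
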
# === LLM-Oriented Cleaning ===
def preprocess_for_llm(text):
    text = contractions.fix(text)             # expand "don't" -> "do not"
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)      # strips digits and punctuation too
    text = re.sub(r'\s+', ' ', text).strip()  # collapse runs of whitespace
    return correct_spelling(text)

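# Before/after sketch (the input sentence is mine). Note how aggressive the
# character regex is; dropping digits and punctuation may be undesirable for
# some fine-tuning corpora.
#   preprocess_for_llm("Don't stop! Chapter 3 begins...")
#   # -> "do not stop chapter begins"
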
# === JSONL Export ===
def export_to_jsonl(chunks, output_path, mode="instruct"):
    with open(output_path, 'w', encoding='utf-8') as f:
        for chunk in chunks:
            if mode == "instruct":
                # empty completion: a fill-in-later template for instruct tuning
                data = {"prompt": f"Summarize the following:\n\n{chunk}",
                        "completion": ""}
            else:
                data = {"text": chunk}
            f.write(json.dumps(data) + "\n")

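# One exported line per mode (illustrative, with a stand-in chunk):
#   instruct:    {"prompt": "Summarize the following:\n\nsome chunk text", "completion": ""}
#   other modes: {"text": "some chunk text"}
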
# === Optional Embedding ===
def embed_chunks(chunks, model_name='all-MiniLM-L6-v2'):
    model = SentenceTransformer(model_name)
    return model.encode(chunks)

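# Shape note: all-MiniLM-L6-v2 produces 384-dimensional sentence embeddings,
# and encode() on a list returns a numpy array with one row per chunk:
#   embed_chunks(["hello world"]).shape  # -> (1, 384)
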
# === CLI-Like Runner ===
def run_pipeline(input_path, output_jsonl, embed=False):
    print(f"📂 Reading: {input_path}")
    text = read_file(input_path)
    print("🧹 Preprocessing text...")
    preprocessed = preprocess_for_llm(text)
    print("📚 Chunking...")
    chunks = chunk_text(preprocessed)
    print(f"📝 Exporting to: {output_jsonl}")
    export_to_jsonl(chunks, output_jsonl)
    vectors = None
    if embed:
        print("🔍 Generating embeddings...")
        vectors = embed_chunks(chunks)
        print(f"✅ Generated {len(vectors)} vectors.")
    print("✅ Pipeline complete.")
    return vectors

# Example use
if __name__ == "__main__":
    run_pipeline("my_file.docx", "formatted_output.jsonl", embed=True)
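
# Rough console output for the run above (a sketch; the chunk count depends on
# the input, and the first embedding call downloads the model from Hugging Face):
#   📂 Reading: my_file.docx
#   🧹 Preprocessing text...
#   📚 Chunking...
#   📝 Exporting to: formatted_output.jsonl
#   🔍 Generating embeddings...
#   ✅ Generated <number of chunks> vectors.
#   ✅ Pipeline complete.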