Advertisement
Python253

split_data_w_markers

Mar 1st, 2024 (edited)
716
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.67 KB | None | 0 0
  1. #!/usr/bin/env python3
  2. # Filename: split_data_w_markers.py
  3. # Author: Jeoi Reqi
  4. # Split Data With Markers is a simple script to be used along with the Data Marker Python Script
  5. # Script: https://pastebin.com/VyfFfZiN
  6.  
  7. import os
  8.  
  9. def split_file(input_file, output_directory, lines_per_chunk=100):
  10.     with open(input_file, 'r', encoding='utf-8') as file:
  11.         lines = file.readlines()
  12.  
  13.     total_lines = len(lines)
  14.     total_chunks = (total_lines // lines_per_chunk) + 1  # Adjust for the last chunk
  15.  
  16.     print(f"Total number of chunks: {total_chunks}")
  17.  
  18.     # Calculate the number of digits needed for naming files
  19.     num_digits = len(str(total_chunks))
  20.  
  21.     for i in range(total_chunks):
  22.         start_index = i * lines_per_chunk
  23.         end_index = min((i + 1) * lines_per_chunk, total_lines)
  24.  
  25.         chunk = lines[start_index:end_index]
  26.  
  27.         # Format the chunk number with leading zeros
  28.         chunk_number = str(i + 1).zfill(num_digits)
  29.  
  30.         output_file_path = os.path.join(output_directory, f'chunk_{chunk_number}.txt')
  31.  
  32.         with open(output_file_path, 'w', encoding='utf-8') as output:
  33.             output.writelines(chunk)
  34.  
  35.     print("File split complete.")
  36.  
  37. if __name__ == "__main__":
  38.     current_directory = os.getcwd()
  39.     input_file_path = os.path.join(current_directory, 'isoon_chat.txt')   # Edit Input Name To Your Desired File Name
  40.     output_directory_path = os.path.join(current_directory, 'split_isoon_chat_data_w_markers')  # Edit Output File Name To Your Desired File Name
  41.  
  42.     os.makedirs(output_directory_path, exist_ok=True)
  43.  
  44.     split_file(input_file_path, output_directory_path, lines_per_chunk=100)  # Split At The Markers
  45.  
  46.  
  47.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement