mti

Basic markdown file splitter in Python

mti
Jul 29th, 2021
1,018
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/env python3
  2.  
  3. import re
  4. import unicodedata
  5.  
  6. def main():
  7.     """
  8.    Splits text files at specified points determined with regex and saves the splitted chunks
  9.    of data as separate files. Names of the new files are determined by the line at
  10.    which the split occurs.
  11.    """
  12.  
  13.     """Configuration:"""
  14.     source_file = "kissatkoiria.md"
  15.     split_regex = "^# "
  16.     encoding = "utf8"
  17.     extension = ".md"
  18.  
  19.     p_obj = re.compile(split_regex)
  20.  
  21.     with open(source_file, encoding=encoding) as filehandle:
  22.         chunk = ""
  23.         splitting_line = ""
  24.         chunk_number = 0
  25.  
  26.         for line in filehandle:
  27.             if p_obj.search(line) is not None:
  28.                 if chunk:
  29.                     filename = get_filename(splitting_line)
  30.                     save_chunk(filename, chunk, chunk_number, extension)
  31.                 chunk = ""
  32.                 splitting_line = line
  33.                 chunk_number += 1
  34.             chunk += line
  35.  
  36.         # Save the last chunk if needed
  37.         if chunk:
  38.             filename = get_filename(splitting_line)
  39.             save_chunk(filename, chunk, chunk_number, extension)
  40.  
  41.  
  42. def save_chunk(filename: str, chunk: str, chunk_number: int, extension: str):
  43.     if not filename:
  44.         filename = f"unknown_chunk_{chunk_number}"
  45.     complete_filename = f"{filename}{extension}"
  46.  
  47.     print(f"\n\n***** Saving chunk number {chunk_number} as {complete_filename}*****")
  48.    
  49.     with open(complete_filename, "w") as filehandle:
  50.         filehandle.write(chunk)
  51.  
  52. def get_filename(splitting_line: str) -> str:
  53.     replaced = splitting_line.replace("# ", "") # Remove the beginning markdown header syntax
  54.     stripped = replaced.strip()
  55.     normalized = unicodedata.normalize("NFKD", stripped)
  56.     filename = re.sub("[./,:;]+", "", normalized)
  57.     return filename
  58.  
  59.  
  60. if __name__ == "__main__":
  61.     main()
  62.  
RAW Paste Data