Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python3
- import re
- import unicodedata
def main(
    source_file: str = "kissatkoiria.md",
    split_regex: str = "^# ",
    encoding: str = "utf8",
    extension: str = ".md",
) -> None:
    """Split a text file into separate files at lines matching a regex.

    The source file is read line by line. Every line that matches
    ``split_regex`` starts a new chunk; the chunk collected so far is
    written out under a name derived from the line that started it.
    Lines before the first matching line form chunk number 0, whose
    splitting line is empty.

    Args:
        source_file: Path of the file to split.
        split_regex: Pattern searched for in each line; a match marks
            the first line of a new chunk.
        encoding: Text encoding used to read ``source_file``.
        extension: Suffix appended to every generated file name.
    """
    splitter = re.compile(split_regex)

    # Collect lines in a list and join once per chunk instead of growing
    # a string with "+=" for every line.
    chunk_lines = []
    splitting_line = ""
    chunk_number = 0

    with open(source_file, encoding=encoding) as filehandle:
        for line in filehandle:
            if splitter.search(line) is not None:
                # A new chunk begins here: flush whatever was collected.
                if chunk_lines:
                    save_chunk(
                        get_filename(splitting_line),
                        "".join(chunk_lines),
                        chunk_number,
                        extension,
                    )
                    chunk_lines = []
                splitting_line = line
                chunk_number += 1
            # The splitting line itself is kept as the first line of its chunk.
            chunk_lines.append(line)

    # Save the last chunk if needed
    if chunk_lines:
        save_chunk(
            get_filename(splitting_line),
            "".join(chunk_lines),
            chunk_number,
            extension,
        )
def save_chunk(
    filename: str,
    chunk: str,
    chunk_number: int,
    extension: str,
    encoding: str = "utf8",
) -> None:
    """Write one chunk of text to ``<filename><extension>``.

    An existing file with the same name is overwritten, because the file
    is opened in ``"w"`` mode.

    Args:
        filename: Base name of the output file. When empty, the name
            ``unknown_chunk_<chunk_number>`` is used instead.
        chunk: Text to write.
        chunk_number: Running number of the chunk; used in the progress
            message and in the fallback file name.
        extension: Suffix appended to the base name.
        encoding: Text encoding of the output file. Defaults to UTF-8 so
            the result does not depend on the platform's default encoding.
    """
    if not filename:
        filename = f"unknown_chunk_{chunk_number}"
    complete_filename = f"{filename}{extension}"
    print(f"\n\n***** Saving chunk number {chunk_number} as {complete_filename}*****")
    # Pass the encoding explicitly: without it, open() falls back to the
    # locale's encoding, which can fail or mangle non-ASCII text.
    with open(complete_filename, "w", encoding=encoding) as filehandle:
        filehandle.write(chunk)
def get_filename(splitting_line: str) -> str:
    """Derive a file name (without extension) from a splitting line.

    The leading markdown heading marker is removed, surrounding
    whitespace is stripped, the text is NFKD-normalized, and the
    characters ``. / , : ;`` are dropped.

    Args:
        splitting_line: The line at which a split occurred; may be empty.

    Returns:
        The cleaned-up name, or an empty string when nothing is left.
    """
    # Remove only the heading marker at the start of the line; a "# "
    # later in the title (e.g. "C# basics") must be preserved.
    without_marker = re.sub(r"^#+\s+", "", splitting_line)
    stripped = without_marker.strip()
    normalized = unicodedata.normalize("NFKD", stripped)
    return re.sub(r"[./,:;]+", "", normalized)
# Run the splitter only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement